#!/usr/bin/env python

# Splits CSV files into smaller segments for more reliable importing
#
# Usage: python scriptname myfile.csv   (or ./scriptname myfile.csv)
#
# For an input named "myfile.csv" the segments are written next to it as
# "myfile-1.csv", "myfile-2.csv", ...; every segment starts with a copy of
# the input's first line (the header).

import os
import sys

# Number of rows per file
size = 10000


def write_segments(input_file, prefix, rows_per_file=None):
    """Copy the rows of *input_file* into numbered segment files.

    The first line read from *input_file* is treated as the header and is
    repeated at the top of every segment. Rows are physical lines: the data
    is split on line breaks only and is not parsed as CSV, so a quoted field
    containing a line break may end up divided between two segments.

    Args:
        input_file: An iterable of lines, normally a file opened in binary
            mode so that rows are copied byte-for-byte.
        prefix: Output path without extension; segment N is written to
            "<prefix>-<N>.csv".
        rows_per_file: Maximum number of data rows per segment. Defaults to
            the module-level ``size``.

    Returns:
        The list of segment file names that were created, in order. It is
        empty when the input holds no data rows (empty or header-only).
    """
    if rows_per_file is None:
        rows_per_file = size

    header = None
    created = []
    segment_file = None
    rows_in_segment = 0

    try:
        for line in input_file:
            if header is None:
                header = line
                continue

            # Open a segment lazily, only once there is a row to put in it,
            # so that no header-only files are produced.
            if segment_file is None:
                name = "%s-%s.csv" % (prefix, len(created) + 1)
                segment_file = open(name, "wb")
                created.append(name)
                segment_file.write(header)

            segment_file.write(line)
            rows_in_segment += 1

            if rows_in_segment == rows_per_file:
                segment_file.close()
                segment_file = None
                rows_in_segment = 0
    finally:
        # Covers both the normal end of input with a partially filled
        # segment and an error raised part-way through.
        if segment_file is not None:
            segment_file.close()

    return created


def main(argv=None):
    """Validate the command line and split the named CSV file.

    Args:
        argv: Argument list in the shape of ``sys.argv`` (program name
            first). Defaults to ``sys.argv``.

    Returns:
        0 on success, 2 when the argument is missing, the file name is not
        "xxx.csv", or the input file cannot be opened.
    """
    if argv is None:
        argv = sys.argv

    # argv[0] is always the script itself, however it was launched, so the
    # file to split can only ever be argv[1].
    if len(argv) < 2:
        print("Specify CSV file as argument: python cleanCSV.py myfile.csv")
        return 2
    path = argv[1]

    # splitext only inspects the last path component, so dots in directory
    # names ("./data.csv") or in the base name ("my.data.csv") are fine.
    prefix, extension = os.path.splitext(path)
    if not extension:
        print("Invalid filename!")
        return 2
    if extension != ".csv":
        print("Input file should be xxx.csv!")
        return 2

    try:
        # Binary mode keeps line endings and encoding exactly as they are.
        input_file = open(path, "rb")
    except (IOError, OSError):
        print("Cannot open file!")
        return 2

    with input_file:
        write_segments(input_file, prefix)
    return 0


if __name__ == "__main__":
    sys.exit(main())