UserGuidelines/Importer: splitCSV.py

File splitCSV.py, 1.2 KB (added by Fran Boon, 11 years ago)

Split a large CSV file into multiple files to avoid memory exhaustion when importing

Line 
#!/usr/bin/env python
"""Split a CSV file into smaller segments for more reliable importing.

Usage: python splitCSV.py myfile.csv

The first line of the input is treated as the header and is repeated at the
top of every segment. Segments are written next to the input file as
``<name>-1.csv``, ``<name>-2.csv``, ... with at most ``size`` data rows each.

NOTE: the input is split on physical lines, so a quoted field that contains
an embedded newline may end up divided across two segments.
"""

import os
import sys

# Number of rows per file
size = 10000


def _output_prefix(path):
    """Return *path* without its ``.csv`` extension.

    Only the final extension is examined, so dots elsewhere in the path
    (``./data.csv``, ``exports.v2/data.csv``, ``my.data.csv``) are accepted.

    Raises:
        ValueError: if *path* has no extension or the extension is not
            ``.csv``; the exception message is the text shown to the user.
    """
    prefix, extension = os.path.splitext(path)
    if not extension:
        raise ValueError("Invalid filename!")
    if extension != ".csv":
        raise ValueError("Input file should be xxx.csv!")
    return prefix


def write_segments(source, prefix, rows_per_file):
    """Copy the lines of *source* into numbered segment files.

    Args:
        source: an iterable of lines opened in binary mode; the first line
            is the header.
        prefix: path prefix for the outputs (``<prefix>-<n>.csv``).
        rows_per_file: maximum number of data rows per segment (>= 1).

    Returns:
        The list of segment file names written, in order. It is empty when
        *source* holds no data rows (empty or header-only input).

    Raises:
        ValueError: if *rows_per_file* is smaller than 1.
    """
    if rows_per_file < 1:
        raise ValueError("rows_per_file must be at least 1")

    written = []
    header = None
    segment_file = None
    rows_in_segment = 0
    try:
        for line in source:
            if header is None:
                header = line
                continue
            if segment_file is None:
                # Open the next segment lazily so that no header-only file
                # is left behind when the row count is a multiple of the
                # segment size.
                name = "%s-%s.csv" % (prefix, len(written) + 1)
                segment_file = open(name, "wb")
                written.append(name)
                segment_file.write(header)
            segment_file.write(line)
            rows_in_segment += 1
            if rows_in_segment == rows_per_file:
                segment_file.close()
                segment_file = None
                rows_in_segment = 0
    finally:
        # Close a partially filled (or failed) segment exactly once.
        if segment_file is not None:
            segment_file.close()
    return written


def main(argv=None):
    """Run the splitter on ``argv[1]`` and return the process exit code.

    Returns 0 on success and 2 for a missing argument, an unsuitable file
    name, or an input file that cannot be opened.
    """
    if argv is None:
        argv = sys.argv

    # argv[0] is always the script itself, never the CSV file.
    if len(argv) < 2:
        print("Specify CSV file as argument: python splitCSV.py myfile.csv")
        return 2
    path = argv[1]

    try:
        prefix = _output_prefix(path)
    except ValueError as err:
        print(err)
        return 2

    # Binary mode copies the bytes unchanged (no newline translation or
    # decoding), whatever the encoding of the CSV is.
    try:
        source = open(path, "rb")
    except (IOError, OSError):
        print("Cannot open file!")
        return 2

    with source:
        write_segments(source, prefix, size)
    return 0


if __name__ == "__main__":
    sys.exit(main())