# csv.py -- exported from a git blame view (commit 3257aa99); table markup removed.
1 | \r |
2 | """\r | |
3 | csv.py - read/write/investigate CSV files\r | |
4 | """\r | |
5 | \r | |
6 | import re\r | |
7 | from functools import reduce\r | |
8 | from _csv import Error, __version__, writer, reader, register_dialect, \\r | |
9 | unregister_dialect, get_dialect, list_dialects, \\r | |
10 | field_size_limit, \\r | |
11 | QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \\r | |
12 | __doc__\r | |
13 | from _csv import Dialect as _Dialect\r | |
14 | \r | |
15 | try:\r | |
16 | from cStringIO import StringIO\r | |
17 | except ImportError:\r | |
18 | from StringIO import StringIO\r | |
19 | \r | |
# Names exported by "from csv import *": the C-level constants/functions
# re-exported from _csv plus the pure-Python classes defined below.
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "__doc__", "excel", "excel_tab",
            "field_size_limit", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
25 | \r | |
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    Instantiating a subclass hands the instance to the C-level
    _csv.Dialect for validation; invalid settings raise csv.Error.
    """
    _name = ""          # registry name; filled in by subclasses/sniffer
    _valid = False      # True only for instances of validated subclasses
    # placeholders -- concrete subclasses override these
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Delegate the attribute checking to the C implementation.
        try:
            _Dialect(self)
        # "except E as e" (valid since 2.6, PEP 3110) instead of the
        # py2-only "except E, e" statement -- behavior is identical.
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
56 | \r | |
class excel(Dialect):
    """Dialect matching the CSV flavour produced by Microsoft Excel."""
    # field layout
    delimiter = ','
    skipinitialspace = False
    # quoting behaviour
    quotechar = '"'
    doublequote = True
    quoting = QUOTE_MINIMAL
    # record separator
    lineterminator = '\r\n'
register_dialect("excel", excel)
66 | \r | |
class excel_tab(excel):
    """The Excel dialect, but with tab-separated fields."""
    # Everything except the delimiter is inherited from excel.
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
71 | \r | |
72 | \r | |
class DictReader:
    """Read CSV rows as dictionaries keyed on a list of fieldnames.

    If fieldnames is omitted, the first row read supplies them.  Extra
    values in a long row are collected in a list under restkey; missing
    values in a short row are filled with restval.
    """
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0               # mirrors the underlying reader's count

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily consume the header row the first time fieldnames is needed.
        if self._fieldnames is None:
            try:
                self._fieldnames = self.reader.next()
            except StopIteration:
                # empty input: fieldnames stays None
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    # Issue 20004: Because DictReader is a classic class, this setter is
    # ignored. At this point in 2.7's lifecycle, it is too late to change the
    # base class for fear of breaking working code. If you want to change
    # fieldnames without overwriting the getter, set _fieldnames directly.
    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        """Return the next data row as a dict keyed by fieldnames."""
        if self.line_num == 0:
            # Used only for its side effect: forces the header row to be
            # consumed before the first data row.
            self.fieldnames
        row = self.reader.next()
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = self.reader.next()
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            # long row: stash the surplus values under restkey
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # short row: pad the missing fields with restval
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d
125 | \r | |
126 | \r | |
class DictWriter:
    """Write dictionaries to a CSV file, one row per dict.

    Values are emitted in the order given by *fieldnames*; keys missing
    from a row dict are filled with *restval*, and unexpected keys are
    handled according to *extrasaction* ("raise" or "ignore").
    """
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # Call form (not the py2-only "raise E, v" statement) --
            # consistent with _dict_to_list below and py3-forward.
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a header row built from the fieldnames themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

    def _dict_to_list(self, rowdict):
        # Translate rowdict into an ordered value list, policing extra
        # keys when extrasaction == "raise".
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        """Write a single dict as one CSV row."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write a sequence of dicts as successive CSV rows."""
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)
159 | \r | |
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # No complex() in this build: substitute float so has_header's
    # type-probing loop still ends with a callable candidate.
    complex = float
165 | \r | |
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample.

        Raises Error if no delimiter can be determined.
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            # no quoted fields found; fall back to frequency analysis
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            # call form instead of the py2-only "raise E, v" statement
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        matches = []
        # Most specific pattern first; stop at the first one that hits.
        # Patterns are raw strings.  BUGFIX: the third pattern used to read
        # '(?P<delim>>[^\w\n"\'])' -- the stray '>' inside the group forced
        # a literal '>' before the delimiter, so a quoted field ending a
        # line (,".*?") was never recognized.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # most frequent quote character wins
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            # most frequent delimiter wins
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)

        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        # list() is a no-op copy on py2 (filter already returns a list);
        # it makes the indexing below explicit and safe.
        data = list(filter(None, data.split('\n')))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                # explicit list: we call .remove() on it below
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = list(modes.items())
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims)[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims:
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        """Guess whether the sample's first row is a header row.

        Creates a dictionary of types of data in each column. If any
        column is of a single type (say, integers), *except* for the first
        row, then the first row is presumed to be labels. If the type
        can't be determined, it is assumed to be a string in which case
        the length of the string is the determining factor: if all of the
        rows except for the first are the same length, it's a header.
        Finally, a 'vote' is taken at the end for each column, adding or
        subtracting from the likelihood of the first row being a header.
        """

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header (builtin since 2.6)

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            # iterate over a snapshot: entries may be deleted below
            for col in list(columnTypes.keys()):

                # NOTE: 'long' exists on py2 only; this module targets 2.7.
                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0