]> git.proxmox.com Git - mirror_edk2.git/blame - AppPkg/Applications/Python/Python-2.7.2/Lib/csv.py
EmbeddedPkg: Extend NvVarStoreFormattedLib LIBRARY_CLASS
[mirror_edk2.git] / AppPkg / Applications / Python / Python-2.7.2 / Lib / csv.py
CommitLineData
4710c53d 1\r
2"""\r
3csv.py - read/write/investigate CSV files\r
4"""\r
5\r
6import re\r
7from functools import reduce\r
8from _csv import Error, __version__, writer, reader, register_dialect, \\r
9 unregister_dialect, get_dialect, list_dialects, \\r
10 field_size_limit, \\r
11 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \\r
12 __doc__\r
13from _csv import Dialect as _Dialect\r
14\r
15try:\r
16 from cStringIO import StringIO\r
17except ImportError:\r
18 from StringIO import StringIO\r
19\r
20__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",\r
21 "Error", "Dialect", "__doc__", "excel", "excel_tab",\r
22 "field_size_limit", "reader", "writer",\r
23 "register_dialect", "get_dialect", "list_dialects", "Sniffer",\r
24 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]\r
25\r
class Dialect:
    """Describe a CSV dialect.

    This class is only a base: it must be subclassed (see csv.excel) and
    the subclass must give real values to the attributes below.  Valid
    attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    Instantiating a dialect validates its attributes and raises Error if
    the underlying _csv module rejects them.

    """
    _name = ""
    _valid = False
    # placeholders, overridden by concrete dialects
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses are flagged as valid; the bare base class keeps
        # the class-level _valid = False.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Check the attributes by building a _csv.Dialect from self.

        Raises Error (not TypeError) when the attributes are rejected.
        """
        try:
            _Dialect(self)
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
56\r
class excel(Dialect):
    """Dialect holding the usual properties of Excel-generated CSV files."""
    # How fields are separated.
    delimiter = ','
    skipinitialspace = False
    # How fields are quoted.
    quotechar = '"'
    doublequote = True
    quoting = QUOTE_MINIMAL
    # How rows are terminated.
    lineterminator = '\r\n'


# Make the dialect available under the name "excel".
register_dialect("excel", excel)
66\r
class excel_tab(excel):
    """Dialect holding the usual properties of Excel-generated
    TAB-delimited files."""
    # Everything except the delimiter is inherited from excel.
    delimiter = '\t'


# Make the dialect available under the name "excel-tab".
register_dialect("excel-tab", excel_tab)
71\r
72\r
class DictReader:
    """Iterate over CSV rows as dicts keyed by field name.

    f          -- source handed to the underlying reader
    fieldnames -- list of keys for the dicts; when None, the first row
                  read from f supplies them
    restkey    -- key under which the surplus values of a row longer
                  than fieldnames are stored (as a list)
    restval    -- value given to the missing fields of a row shorter
                  than fieldnames
    dialect, *args, **kwds -- passed through to reader()

    line_num mirrors the underlying reader's line_num after every read.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        """Field names; read lazily from the first row when not given."""
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                # Empty input: leave the field names as None.
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        """Return the next non-blank row as a dict.

        Raises StopIteration when the underlying reader is exhausted.
        """
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
            # Keep line_num in step with the reader while skipping, so
            # it refers to the row actually returned.
            self.line_num = self.reader.line_num
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            # Long row: collect the surplus values under restkey.
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # Short row: fill the missing fields with restval.
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d
121\r
122\r
class DictWriter:
    """Write dicts as CSV rows, ordered by a list of field names.

    f            -- target handed to the underlying writer
    fieldnames   -- list of keys giving the column order
    restval      -- value written for keys missing from a row dict
    extrasaction -- "raise" (default) to raise ValueError when a row
                    dict has keys not in fieldnames, "ignore" to drop
                    them; matched case-insensitively
    dialect, *args, **kwds -- passed through to writer()
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        # Store the normalised form: _dict_to_list compares against the
        # lower-case literal, so "RAISE" must not degrade to "ignore".
        self.extrasaction = extrasaction.lower()
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a row made of the field names themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

    def _dict_to_list(self, rowdict):
        """Return rowdict's values in fieldnames order.

        Raises ValueError for unknown keys when extrasaction is "raise".
        """
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                # repr() keeps non-string keys from turning this into a
                # TypeError inside join().
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join([repr(k) for k in wrong_fields]))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        """Write one dict as a row; returns the writer's result."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write every dict in rowdicts; returns the writer's result."""
        # All rows are converted before any is written, so an invalid
        # dict raises before output starts.
        rows = [self._dict_to_list(rowdict) for rowdict in rowdicts]
        return self.writer.writerows(rows)
155\r
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float

# Likewise guard against interpreters that have no separate long type
try:
    long
except NameError:
    long = int


class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']

    def sniff(self, sample, delimiters=None):
        """
        Returns a Dialect subclass corresponding to the sample.

        sample     -- text to analyse
        delimiters -- optional collection restricting the characters
                      accepted as delimiter

        Raises Error if no delimiter can be determined.
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
            self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            # Quoting gave no answer; fall back to character statistics.
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect

    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns (quotechar, doublequote, delimiter, skipinitialspace).
        """

        matches = []
        # Patterns are tried from most to least specific; the first one
        # that matches anything decides.
        for restr in (
                r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)',  # ,".*?",
                r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',    #  ".*?",
                r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',    # ,".*?"
                r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                             #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        quotes = {}
        delims = {}
        spaces = 0
        groupindex = regexp.groupindex
        for m in matches:
            # findall() yields the groups positionally, so translate the
            # group names into 0-based positions.
            n = groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # The matching pattern has no delimiter group.
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # Most frequent quote wins; on a tie the later key is kept.
        quotechar = reduce(lambda a, b: a if quotes[a] > quotes[b] else b,
                           quotes.keys())

        if delims:
            delim = reduce(lambda a, b: a if delims[a] > delims[b] else b,
                           delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n':  # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format.  The delimiter is escaped because it may
        # be a regex metacharacter such as '+', '*' or '|'.
        dq_regexp = re.compile(
            r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" %
            {'delim': re.escape(delim), 'quote': quotechar}, re.MULTILINE)

        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)

    def _initial_space(self, first_row, delim):
        """True when every delim in first_row is followed by a space."""
        return first_row.count(delim) == first_row.count("%c " % delim)

    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.

        Returns (delimiter, skipinitialspace); delimiter is '' when none
        could be found.
        """

        # Drop empty lines.
        data = [line for line in data.split('\n') if line]

        ascii_chars = [chr(c) for c in range(127)]  # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii_chars:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                # A list copy, because the mode is removed from it below.
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    # The character never occurred.
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    mode = reduce(lambda a, b: a if a[1] > b[1] else b,
                                  items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(mode)
                    modes[char] = (mode[0],
                                   mode[1] - sum([item[1] for item in items]))
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            # Number of rows examined so far; the last chunk may be
            # shorter than chunkLength.
            total = float(min(chunkLength * iteration, len(data)))
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1] / total) >= consistency and
                                (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                return (delim, self._initial_space(data[0], delim))

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims:
                    return (d, self._initial_space(data[0], d))

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v, k) for (k, v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        return (delim, self._initial_space(data[0], delim))

    def has_header(self, sample):
        """Guess whether the first row of sample is a header; returns a bool.

        Raises Error if sniff() cannot determine the sample's delimiter.
        """
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr)  # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns):
            columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue  # skip rows that have irregular number of columns

            # Iterate over a snapshot: columns are deleted in the loop.
            for col in list(columnTypes.keys()):

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None:  # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if isinstance(colType, int):  # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else:  # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0