"""
csv.py - read/write/investigate CSV files
"""

import re
from functools import reduce
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 field_size_limit, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
from _csv import Dialect as _Dialect

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "__doc__", "excel", "excel_tab",
            "field_size_limit", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]

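# A minimal usage sketch for the reader/writer objects re-exported from _csv
# above (illustrative only; the file name 'eggs.csv' is made up):
#
#     import csv
#     with open('eggs.csv', 'wb') as f:
#         w = csv.writer(f, dialect='excel')
#         w.writerow(['Spam', 'Lovely Spam', 'Wonderful Spam'])
#     with open('eggs.csv', 'rb') as f:
#         for row in csv.reader(f, delimiter=',', quotechar='"'):
#             print ', '.join(row)
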
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel). Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        try:
            _Dialect(self)
        except TypeError, e:
            # We do this for compatibility with py2.3
            raise Error(str(e))

class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)

class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)

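# Further dialects can be defined the same way the two built-in ones above
# are: subclass Dialect (or excel), fill in the attributes, and register the
# class under a name of your choosing.  A sketch, where the 'unix-pipe' name
# and its settings are only an example, not a dialect this module provides:
#
#     class unix_pipe(Dialect):
#         delimiter = '|'
#         quotechar = '"'
#         doublequote = True
#         skipinitialspace = False
#         lineterminator = '\n'
#         quoting = QUOTE_MINIMAL
#     register_dialect('unix-pipe', unix_pipe)
#
# Instantiating such a subclass runs _validate(), so missing or invalid
# attributes surface as csv.Error rather than failing later inside _csv.
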

class DictReader:
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        if self._fieldnames is None:
            try:
                self._fieldnames = self.reader.next()
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    # Issue 20004: Because DictReader is a classic class, this setter is
    # ignored. At this point in 2.7's lifecycle, it is too late to change the
    # base class for fear of breaking working code. If you want to change
    # fieldnames without overwriting the getter, set _fieldnames directly.
    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = self.reader.next()
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = self.reader.next()
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d

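# Usage sketch for DictReader (illustrative; the sample data is made up).
# Fields beyond the header land in a list under restkey, and missing fields
# are filled with restval:
#
#     import csv
#     from StringIO import StringIO
#     f = StringIO('name,age\nalice,30,extra\nbob\n')
#     rdr = csv.DictReader(f, restkey='rest', restval='')
#     print rdr.next()['rest']     # ['extra']  (long row: extras captured)
#     print rdr.next()['age']      # ''         (short row: padded with restval)
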

class DictWriter:
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError, \
                  ("extrasaction (%s) must be 'raise' or 'ignore'" %
                   extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

    def _dict_to_list(self, rowdict):
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)

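# Usage sketch for DictWriter (illustrative; the sample data is made up).
# Keys missing from a row are written as restval, and unexpected keys raise
# ValueError unless extrasaction='ignore':
#
#     import csv
#     from StringIO import StringIO
#     out = StringIO()
#     w = csv.DictWriter(out, fieldnames=['name', 'age'], restval='N/A')
#     w.writeheader()
#     w.writerow({'name': 'alice', 'age': 30})
#     w.writerow({'name': 'bob'})              # age column written as 'N/A'
#     print out.getvalue()
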
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float

class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error, "Could not determine delimiter"

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect

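    # Usage sketch for sniff() (illustrative; the sample text and delimiter
    # hint are made up).  The return value is a Dialect subclass that can be
    # handed straight to reader():
    #
    #     import csv
    #     from StringIO import StringIO
    #     sample = 'name;age\r\nalice;30\r\nbob;25\r\n'
    #     dialect = csv.Sniffer().sniff(sample, delimiters=';,')
    #     print dialect.delimiter                      # ';'
    #     rows = list(csv.reader(StringIO(sample), dialect))
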

    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        matches = []
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)


        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)

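    # A quick illustration of what the first pattern above captures (the
    # sample string is made up):
    #
    #     import re
    #     m = re.search('(?P<delim>[^\w\n"\'])(?P<space> ?)'
    #                   '(?P<quote>["\']).*?(?P=quote)(?P=delim)',
    #                   "1,'some text',3")
    #     print m.group('delim'), m.group('quote')     # , '
    #
    # i.e. the comma is taken as the probable delimiter and the single quote
    # as the probable quotechar for that fragment.
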

    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)

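    # A small worked example of the meta-frequency scheme above (the numbers
    # are assumed, not measured): if ',' occurs 3 times on 9 of the first 10
    # lines and 4 times on the tenth, its frequency table is {3: 9, 4: 1}.
    # The mode (3, 9) is adjusted by subtracting the other counts, giving
    # (3, 8), and 8/10 = 0.8 is below the 0.9 consistency floor, so ',' is not
    # accepted from this chunk alone; a character that hits the same non-zero
    # count on all 10 lines scores 10/10 and is accepted at once.
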

    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in columnTypes.keys():

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
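
# Usage sketch for Sniffer.has_header() (illustrative; the sample text is
# made up).  The vote favours a header here because every row after the first
# has an integer in the second column while the first row does not:
#
#     import csv
#     sample = 'name,age\r\nalice,30\r\nbob,25\r\ncarol,41\r\n'
#     print csv.Sniffer().has_header(sample)           # True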