# Vendored copy of CPython 2.7.10 Lib/csv.py
# (edk2 AppPkg/Applications/Python/Python-2.7.10/Lib/csv.py).
"""
csv.py - read/write/investigate CSV files
"""

import re
from functools import reduce
# The performance-critical reader/writer machinery lives in the C
# accelerator module _csv; this file layers the pure-Python
# conveniences (Dialect, DictReader/DictWriter, Sniffer) on top of it.
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 field_size_limit, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
from _csv import Dialect as _Dialect

try:
    # Prefer the C-accelerated cStringIO; fall back to the pure-Python
    # StringIO on builds that omit the accelerator (Python 2 only).
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

# Public names re-exported by "from csv import *"; keep in sync with
# the classes defined below and the _csv re-exports above.
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "__doc__", "excel", "excel_tab",
            "field_size_limit", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]

class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses count as valid dialects; instantiating the
        # bare base class leaves _valid False and fails validation
        # below because all the placeholder attributes are None.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Let the C implementation check the attribute types/values.
        try:
            _Dialect(self)
        except TypeError as e:
            # "except X as e" instead of the Python-2-only
            # "except X, e" comma form; works on 2.6+ and 3.x.
            # We re-raise as csv.Error for compatibility with py2.3.
            raise Error(str(e))

class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    # Field layout
    delimiter = ','
    quotechar = '"'
    skipinitialspace = False
    # Quotes inside quoted fields are doubled, and only fields that
    # need quoting get it.
    doublequote = True
    quoting = QUOTE_MINIMAL
    # Excel writes DOS line endings regardless of platform.
    lineterminator = '\r\n'
register_dialect("excel", excel)

class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    # Identical to the comma dialect except for the separator.
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)


class DictReader:
    """Read CSV rows as dicts keyed by the field names.

    If *fieldnames* is omitted, the first row read from *f* supplies
    the keys.  Extra values in a long row are collected in a list
    under *restkey*; missing values in a short row are filled with
    *restval*.
    """
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames    # list of keys for the dict
        self.restkey = restkey           # key to catch long rows
        self.restval = restval           # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily consume the header row the first time it is needed.
        # next(self.reader) instead of the Python-2-only
        # self.reader.next() attribute call; identical on 2.6+.
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    # Issue 20004: Because DictReader is a classic class, this setter is
    # ignored.  At this point in 2.7's lifecycle, it is too late to change the
    # base class for fear of breaking working code.  If you want to change
    # fieldnames without overwriting the getter, set _fieldnames directly.
    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        if self.line_num == 0:
            # Used only for its side effect (consuming the header row).
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            # Long row: stash the surplus values under restkey.
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # Short row: pad the missing keys with restval.
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d


class DictWriter:
    """Write dicts as CSV rows, one per dict, ordered by *fieldnames*.

    Keys missing from a row dict are written as *restval*; keys not in
    *fieldnames* either raise ValueError or are dropped, depending on
    *extrasaction* ("raise" or "ignore").
    """
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # Call-form raise: the original "raise ValueError, (...)"
            # statement form is Python-2-only syntax; this form works
            # on 2.x and 3.x and matches Dialect._validate's style.
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a header row built from the field names themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

    def _dict_to_list(self, rowdict):
        # Project rowdict onto the fieldnames order, filling gaps with
        # restval; in "raise" mode, reject unexpected keys first.
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        return self.writer.writerows(
            [self._dict_to_list(rowdict) for rowdict in rowdicts])

# Guard Sniffer's type checking against builds that exclude complex().
# Some minimal/embedded interpreter builds omit complex(); remap the
# name to float so Sniffer.has_header's type-probing list stays valid.
try:
    complex
except NameError:
    complex = float

class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            # Call-form raise (the "raise X, msg" statement is py2-only).
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        # Note: returns the dialect *class*, not an instance.
        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        matches = []
        # Raw strings so the backslashes reach the regex engine verbatim.
        # BUG FIX: the third pattern used to read (?P<delim>>[^\w\n"\'])
        # -- the stray '>' made the delim group two characters wide, so a
        # one-character delimiter before a line-final quoted field could
        # never match.  This matches the corrected pattern shipped with
        # Python 3's csv.Sniffer.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):  # ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            # findall() returns tuples indexed by group number - 1.
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # Pattern had no delim group (the last restr above).
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # Most frequent quote character wins (ties resolved by the
        # reduce order, kept as-is for behavioral compatibility).
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)

        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        # list() so the result is a real list on both Python 2 and 3
        # (len() and slicing are used below).
        data = list(filter(None, data.split('\n')))

        # Renamed from 'ascii' so the ascii() builtin is not shadowed.
        ascii_chars = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii_chars:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                # list() so .remove() and indexing below work on a list
                # on both Python 2 and 3 (dict.items() is a view on 3).
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = list(modes.items())
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims:
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            # list() so the del below never mutates the dict while its
            # keys are being iterated.
            for col in list(columnTypes.keys()):

                # 'long' exists on Python 2 only (this module targets the
                # 2.7 runtime); 'complex' may have been remapped to float
                # by the guard above this class.
                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0