# NOTE(review): removed gitweb scrape residue (blob URL and commit hash).
# This file is Lib/csv.py from the Python-2.7.10 sources bundled under
# AppPkg/Applications/Python in the EDK2 tree.
"""
csv.py - read/write/investigate CSV files
"""
from functools import reduce
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 field_size_limit, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
from _csv import Dialect as _Dialect

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
20 __all__
= [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
21 "Error", "Dialect", "__doc__", "excel", "excel_tab",
22 "field_size_limit", "reader", "writer",
23 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
24 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
27 """Describe an Excel dialect.
29 This must be subclassed (see csv.excel). Valid attributes are:
30 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
31 lineterminator, quoting.
41 skipinitialspace
= None
46 if self
.__class
__ != Dialect
:
54 # We do this for compatibility with py2.3
58 """Describe the usual properties of Excel-generated CSV files."""
62 skipinitialspace
= False
63 lineterminator
= '\r\n'
64 quoting
= QUOTE_MINIMAL
65 register_dialect("excel", excel
)
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
class DictReader:
    """Read rows from *f* as dicts keyed by *fieldnames*.

    f           -- any iterable of CSV lines (e.g. an open file)
    fieldnames  -- sequence of keys; if None, taken from the first row
    restkey     -- key under which to collect extra fields of long rows
    restval     -- value used to pad short rows
    """
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily read the header row the first time fieldnames is needed.
        # next(self.reader) is equivalent to the py2-only self.reader.next()
        # but also works on later interpreters.
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    # Issue 20004: Because DictReader is a classic class, this setter is
    # ignored.  At this point in 2.7's lifecycle, it is too late to change the
    # base class for fear of breaking working code.  If you want to change
    # fieldnames without overwriting the getter, set _fieldnames directly.
    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        """Return the next row as a dict, padding short rows with restval
        and collecting extra fields of long rows under restkey."""
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d

    # Forward-compatible alias: lets instances also satisfy the py3
    # iterator protocol; harmless no-op under 2.7.
    __next__ = next
class DictWriter:
    """Write dicts to *f* as rows in *fieldnames* order.

    restval      -- value written for keys missing from a row dict
    extrasaction -- 'raise' (default) or 'ignore' for unexpected keys
    """
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # Call-form raise is identical on py2 and also valid on py3.
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a header row built from the fieldnames themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

    def _dict_to_list(self, rowdict):
        # Convert a row dict to a list in fieldnames order; when
        # extrasaction == "raise", reject keys not in fieldnames.
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float

class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            # Call-form raise: identical behavior on py2, valid on py3.
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        matches = []
        # NOTE: the third pattern originally read '(?P<delim>>[^\w\n"\'])'
        # with a stray '>' (a long-standing CPython typo), which made the
        # ,".*?"-at-end-of-line case unmatchable.  Fixed here.
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
                skipinitialspace = 0
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)



        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        # list comprehension keeps py2 behavior (a list) on every version
        data = [line for line in data.split('\n') if line]

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                # list(...) because items is mutated (remove) below
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = list(modes.items())
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines, unless we're done
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims:
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            # list(...) because entries may be deleted during iteration
            for col in list(columnTypes.keys()):

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0