# Source: git.proxmox.com Git - mirror_edk2.git/blob - AppPkg/Applications/Python/Python-2.7.2/Lib/csv.py
"""
csv.py - read/write/investigate CSV files
"""
7 from functools
import reduce
8 from _csv
import Error
, __version__
, writer
, reader
, register_dialect
, \
9 unregister_dialect
, get_dialect
, list_dialects
, \
11 QUOTE_MINIMAL
, QUOTE_ALL
, QUOTE_NONNUMERIC
, QUOTE_NONE
, \
13 from _csv
import Dialect
as _Dialect
16 from cStringIO
import StringIO
18 from StringIO
import StringIO
20 __all__
= [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
21 "Error", "Dialect", "__doc__", "excel", "excel_tab",
22 "field_size_limit", "reader", "writer",
23 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
24 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
27 """Describe an Excel dialect.
29 This must be subclassed (see csv.excel). Valid attributes are:
30 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
31 lineterminator, quoting.
41 skipinitialspace
= None
46 if self
.__class
__ != Dialect
:
54 # We do this for compatibility with py2.3
58 """Describe the usual properties of Excel-generated CSV files."""
62 skipinitialspace
= False
63 lineterminator
= '\r\n'
64 quoting
= QUOTE_MINIMAL
65 register_dialect("excel", excel
)
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    # Everything else is inherited from ``excel``; only the separator differs.
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
def __init__(self, f, fieldnames=None, restkey=None, restval=None,
             dialect="excel", *args, **kwds):
    """Set up a dict-producing reader on top of the basic ``reader``.

    f          -- source handed unchanged to ``reader``.
    fieldnames -- keys used for each row dict; when None, the first row
                  obtained from the underlying reader supplies them.
    restkey    -- key that collects the surplus values of long rows.
    restval    -- value stored for keys that a short row does not cover.
    dialect, *args, **kwds -- forwarded unchanged to ``reader``.
    """
    self._fieldnames = fieldnames   # list of keys for the dict
    self.restkey = restkey          # key to catch long rows
    self.restval = restval          # default value for short rows
    self.reader = reader(f, dialect, *args, **kwds)
    self.dialect = dialect
    # Row iteration tests ``self.line_num == 0`` before anything has been
    # read, so the counter must exist as soon as the object is built.
    self.line_num = 0
88 if self
._fieldnames
is None:
90 self
._fieldnames
= self
.reader
.next()
93 self
.line_num
= self
.reader
.line_num
94 return self
._fieldnames
def fieldnames(self, value):
    """Replace the stored list of dict keys with ``value``."""
    self._fieldnames = value
101 if self
.line_num
== 0:
102 # Used only for its side effect.
104 row
= self
.reader
.next()
105 self
.line_num
= self
.reader
.line_num
107 # unlike the basic reader, we prefer not to return blanks,
108 # because we will typically wind up with a dict full of None
111 row
= self
.reader
.next()
112 d
= dict(zip(self
.fieldnames
, row
))
113 lf
= len(self
.fieldnames
)
116 d
[self
.restkey
] = row
[lf
:]
118 for key
in self
.fieldnames
[lr
:]:
119 d
[key
] = self
.restval
def __init__(self, f, fieldnames, restval="", extrasaction="raise",
             dialect="excel", *args, **kwds):
    """Set up a writer that turns dicts into CSV rows.

    f            -- target handed unchanged to ``writer``.
    fieldnames   -- keys, in output order, looked up in every row dict.
    restval      -- value written for keys a row dict does not contain.
    extrasaction -- "raise" or "ignore" (checked case-insensitively).
    dialect, *args, **kwds -- forwarded unchanged to ``writer``.

    Raises ValueError when ``extrasaction`` is neither of the two
    accepted words.
    """
    self.fieldnames = fieldnames    # list of keys for the dict
    self.restval = restval          # for writing short dicts
    if extrasaction.lower() not in ("raise", "ignore"):
        # Call form of raise: valid on Python 2 and Python 3 alike.
        raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                         % extrasaction)
    self.extrasaction = extrasaction
    self.writer = writer(f, dialect, *args, **kwds)
def writeheader(self):
    """Write one row whose cells are the field names themselves.

    Returns whatever ``self.writerow`` returns, in line with the other
    write methods, which also hand back the underlying writer's result.
    """
    # Map every field name onto itself so the header travels through the
    # same dict-to-row path as ordinary data.
    header = dict(zip(self.fieldnames, self.fieldnames))
    return self.writerow(header)
def _dict_to_list(self, rowdict):
    """Return the values of ``rowdict`` ordered like ``self.fieldnames``.

    Keys absent from ``rowdict`` yield ``self.restval``.  When
    ``self.extrasaction`` equals "raise", keys of ``rowdict`` that are not
    in ``self.fieldnames`` cause a ValueError naming them; otherwise such
    keys are left out of the result.
    """
    if self.extrasaction == "raise":
        wrong_fields = [k for k in rowdict if k not in self.fieldnames]
        # Complain only when something really is out of place.
        if wrong_fields:
            # %-formatting keeps the message for string keys unchanged and
            # also copes with keys that are not strings.
            raise ValueError("dict contains fields not in fieldnames: " +
                             ", ".join(["%s" % (k,) for k in wrong_fields]))
    return [rowdict.get(key, self.restval) for key in self.fieldnames]
def writerow(self, rowdict):
    """Write one dict as a row and return the underlying writer's result.

    The dict is first flattened by ``self._dict_to_list``.
    """
    return self.writer.writerow(self._dict_to_list(rowdict))
def writerows(self, rowdicts):
    """Write every dict in ``rowdicts`` and return the writer's result.

    All dicts are converted by ``self._dict_to_list`` before anything is
    passed to the underlying writer, so a dict that fails conversion
    stops the call before any row is written.
    """
    rows = [self._dict_to_list(rowdict) for rowdict in rowdicts]
    return self.writer.writerows(rows)
156 # Guard Sniffer's type checking against builds that exclude complex()
164 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
165 Returns a Dialect object.
168 # in case there is more than one possible delimiter
169 self
.preferred
= [',', '\t', ';', ' ', ':']
def sniff(self, sample, delimiters=None):
    """
    Returns a dialect class (a Dialect subclass) corresponding to the sample.

    sample     -- text to analyse.
    delimiters -- passed on to the guessing helpers to restrict the
                  candidate delimiter characters; None means no restriction.

    Raises Error when no delimiter can be determined.
    """
    quotechar, doublequote, delimiter, skipinitialspace = \
        self._guess_quote_and_delimiter(sample, delimiters)
    if not delimiter:
        # Quoting gave no answer; fall back to the second guessing helper.
        delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                            delimiters)

    if not delimiter:
        # Call form of raise: valid on Python 2 and Python 3 alike.
        raise Error("Could not determine delimiter")

    class dialect(Dialect):
        _name = "sniffed"
        lineterminator = '\r\n'
        quoting = QUOTE_MINIMAL

    dialect.doublequote = doublequote
    dialect.delimiter = delimiter
    # _csv.reader won't accept a quotechar of ''
    dialect.quotechar = quotechar or '"'
    dialect.skipinitialspace = skipinitialspace

    return dialect
def _guess_quote_and_delimiter(self, data, delimiters):
    """
    Looks for text enclosed between two identical quotes
    (the probable quotechar) which are preceded and followed
    by the same character (the probable delimiter).
    For example:
                     ,'some text',
    The quote with the most wins, same with the delimiter.
    If there is no quotechar the delimiter can't be determined
    this way.

    data       -- sample text to scan.
    delimiters -- when not None, only characters contained in it are
                  counted as delimiter candidates.

    Returns (quotechar, doublequote, delimiter, skipinitialspace);
    ('', False, None, 0) when no quoted field is found.
    """
    matches = []
    regexp = None
    # Tried from most to least specific; the first pattern that matches
    # anything decides.  "\\w" is spelled with a doubled backslash so the
    # pattern text is the same on every Python version.
    for restr in ('(?P<delim>[^\\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)',  # ,".*?",
                  '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\\w\n"\'])(?P<space> ?)',    #  ".*?",
                  '(?P<delim>[^\\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',    # ,".*?"
                  '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                              #  ".*?" (no delim, no space)
        regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
        matches = regexp.findall(data)
        if matches:
            break

    if not matches:
        # (quotechar, doublequote, delimiter, skipinitialspace)
        return ('', False, None, 0)

    # Group positions depend on which pattern matched; the last pattern
    # has no 'delim'/'space' group at all.
    groups = regexp.groupindex
    quote_pos = groups['quote'] - 1
    delim_group = groups.get('delim')
    space_group = groups.get('space')

    quotes = {}
    delims = {}
    spaces = 0
    for m in matches:
        key = m[quote_pos]
        if key:
            quotes[key] = quotes.get(key, 0) + 1
        if delim_group is None:
            continue
        key = m[delim_group - 1]
        if key and (delimiters is None or key in delimiters):
            delims[key] = delims.get(key, 0) + 1
        if space_group is not None and m[space_group - 1]:
            spaces += 1

    # On equal counts the later candidate wins.
    quotechar = reduce(lambda a, b: a if quotes[a] > quotes[b] else b,
                       quotes.keys())

    if delims:
        delim = reduce(lambda a, b: a if delims[a] > delims[b] else b,
                       delims.keys())
        skipinitialspace = delims[delim] == spaces
        if delim == '\n':  # most likely a file with a single column
            delim = ''
    else:
        # there is *no* delimiter, it's a single column of quoted data
        delim = ''
        skipinitialspace = 0

    # if we see an extra quote between delimiters, we've got a
    # double quoted format
    # The delimiter is escaped so regex metacharacters such as '*' or '|'
    # are matched literally instead of corrupting the pattern.
    dq_regexp = re.compile(
        r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)"
        % {'delim': re.escape(delim), 'quote': quotechar}, re.MULTILINE)

    if dq_regexp.search(data):
        doublequote = True
    else:
        doublequote = False

    return (quotechar, doublequote, delim, skipinitialspace)
277 def _guess_delimiter(self
, data
, delimiters
):
279 The delimiter /should/ occur the same number of times on
280 each row. However, due to malformed data, it may not. We don't want
281 an all or nothing approach, so we allow for small variations in this
283 1) build a table of the frequency of each character on every line.
284 2) build a table of frequencies of this frequency (meta-frequency?),
285 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
287 3) use the mode of the meta-frequency to determine the /expected/
288 frequency for that character
289 4) find out how often the character actually meets that goal
290 5) the character that best meets its goal is the delimiter
291 For performance reasons, the data is evaluated in chunks, so it can
292 try and evaluate the smallest portion of the data possible, evaluating
293 additional chunks as necessary.
296 data
= filter(None, data
.split('\n'))
298 ascii
= [chr(c
) for c
in range(127)] # 7-bit ASCII
300 # build frequency tables
301 chunkLength
= min(10, len(data
))
306 start
, end
= 0, min(chunkLength
, len(data
))
307 while start
< len(data
):
309 for line
in data
[start
:end
]:
311 metaFrequency
= charFrequency
.get(char
, {})
312 # must count even if frequency is 0
313 freq
= line
.count(char
)
315 metaFrequency
[freq
] = metaFrequency
.get(freq
, 0) + 1
316 charFrequency
[char
] = metaFrequency
318 for char
in charFrequency
.keys():
319 items
= charFrequency
[char
].items()
320 if len(items
) == 1 and items
[0][0] == 0:
322 # get the mode of the frequencies
324 modes
[char
] = reduce(lambda a
, b
: a
[1] > b
[1] and a
or b
,
326 # adjust the mode - subtract the sum of all
328 items
.remove(modes
[char
])
329 modes
[char
] = (modes
[char
][0], modes
[char
][1]
330 - reduce(lambda a
, b
: (0, a
[1] + b
[1]),
333 modes
[char
] = items
[0]
335 # build a list of possible delimiters
336 modeList
= modes
.items()
337 total
= float(chunkLength
* iteration
)
338 # (rows of consistent data) / (number of rows) = 100%
340 # minimum consistency threshold
342 while len(delims
) == 0 and consistency
>= threshold
:
343 for k
, v
in modeList
:
344 if v
[0] > 0 and v
[1] > 0:
345 if ((v
[1]/total
) >= consistency
and
346 (delimiters
is None or k
in delimiters
)):
351 delim
= delims
.keys()[0]
352 skipinitialspace
= (data
[0].count(delim
) ==
353 data
[0].count("%c " % delim
))
354 return (delim
, skipinitialspace
)
356 # analyze another chunkLength lines
363 # if there's more than one, fall back to a 'preferred' list
365 for d
in self
.preferred
:
366 if d
in delims
.keys():
367 skipinitialspace
= (data
[0].count(d
) ==
368 data
[0].count("%c " % d
))
369 return (d
, skipinitialspace
)
371 # nothing else indicates a preference, pick the character that
373 items
= [(v
,k
) for (k
,v
) in delims
.items()]
377 skipinitialspace
= (data
[0].count(delim
) ==
378 data
[0].count("%c " % delim
))
379 return (delim
, skipinitialspace
)
382 def has_header(self
, sample
):
383 # Creates a dictionary of types of data in each column. If any
384 # column is of a single type (say, integers), *except* for the first
385 # row, then the first row is presumed to be labels. If the type
386 # can't be determined, it is assumed to be a string in which case
387 # the length of the string is the determining factor: if all of the
388 # rows except for the first are the same length, it's a header.
389 # Finally, a 'vote' is taken at the end for each column, adding or
390 # subtracting from the likelihood of the first row being a header.
392 rdr
= reader(StringIO(sample
), self
.sniff(sample
))
394 header
= rdr
.next() # assume first row is header
396 columns
= len(header
)
398 for i
in range(columns
): columnTypes
[i
] = None
402 # arbitrary number of rows to check, to keep it sane
407 if len(row
) != columns
:
408 continue # skip rows that have irregular number of columns
410 for col
in columnTypes
.keys():
412 for thisType
in [int, long, float, complex]:
416 except (ValueError, OverflowError):
419 # fallback to length of string
420 thisType
= len(row
[col
])
422 # treat longs as ints
426 if thisType
!= columnTypes
[col
]:
427 if columnTypes
[col
] is None: # add new column type
428 columnTypes
[col
] = thisType
430 # type is inconsistent, remove column from
434 # finally, compare results against first row and "vote"
435 # on whether it's a header
437 for col
, colType
in columnTypes
.items():
438 if type(colType
) == type(0): # it's a length
439 if len(header
[col
]) != colType
:
443 else: # attempt typecast
446 except (ValueError, TypeError):