# Vendored copy of CPython 2.7.10 Lib/csv.py
# (edk2 AppPkg/Applications/Python/Python-2.7.10/Lib/csv.py).
"""
csv.py - read/write/investigate CSV files
"""

import re
from functools import reduce
# The performance-critical reader/writer machinery lives in the C
# accelerator module _csv; this file layers the pure-Python
# conveniences (Dialect, DictReader/DictWriter, Sniffer) on top of it.
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 field_size_limit, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
from _csv import Dialect as _Dialect

try:
    # Prefer the C-accelerated cStringIO; fall back to the pure-Python
    # StringIO on builds that omit the accelerator (Python 2 only).
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

# Public names re-exported by "from csv import *"; keep in sync with
# the classes defined below and the _csv re-exports above.
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "__doc__", "excel", "excel_tab",
            "field_size_limit", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]

class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses count as valid dialects; instantiating the
        # bare base class leaves _valid False and fails validation
        # below because all the placeholder attributes are None.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Let the C implementation check the attribute types/values.
        try:
            _Dialect(self)
        except TypeError as e:
            # "except X as e" instead of the Python-2-only
            # "except X, e" comma form; works on 2.6+ and 3.x.
            # We re-raise as csv.Error for compatibility with py2.3.
            raise Error(str(e))

class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    # Field layout
    delimiter = ','
    quotechar = '"'
    skipinitialspace = False
    # Quotes inside quoted fields are doubled, and only fields that
    # need quoting get it.
    doublequote = True
    quoting = QUOTE_MINIMAL
    # Excel writes DOS line endings regardless of platform.
    lineterminator = '\r\n'
register_dialect("excel", excel)

class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    # Identical to the comma dialect except for the separator.
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)


class DictReader:
    """Read CSV rows as dicts keyed by the field names.

    If *fieldnames* is omitted, the first row read from *f* supplies
    the keys.  Extra values in a long row are collected in a list
    under *restkey*; missing values in a short row are filled with
    *restval*.
    """
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames    # list of keys for the dict
        self.restkey = restkey           # key to catch long rows
        self.restval = restval           # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily consume the header row the first time it is needed.
        # next(self.reader) instead of the Python-2-only
        # self.reader.next() attribute call; identical on 2.6+.
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    # Issue 20004: Because DictReader is a classic class, this setter is
    # ignored.  At this point in 2.7's lifecycle, it is too late to change the
    # base class for fear of breaking working code.  If you want to change
    # fieldnames without overwriting the getter, set _fieldnames directly.
    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        if self.line_num == 0:
            # Used only for its side effect (consuming the header row).
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            # Long row: stash the surplus values under restkey.
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # Short row: pad the missing keys with restval.
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d


class DictWriter:
    """Write dicts as CSV rows, one per dict, ordered by *fieldnames*.

    Keys missing from a row dict are written as *restval*; keys not in
    *fieldnames* either raise ValueError or are dropped, depending on
    *extrasaction* ("raise" or "ignore").
    """
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # Call-form raise: the original "raise ValueError, (...)"
            # statement form is Python-2-only syntax; this form works
            # on 2.x and 3.x and matches Dialect._validate's style.
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a header row built from the field names themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

    def _dict_to_list(self, rowdict):
        # Project rowdict onto the fieldnames order, filling gaps with
        # restval; in "raise" mode, reject unexpected keys first.
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        return self.writer.writerows(
            [self._dict_to_list(rowdict) for rowdict in rowdicts])

# Guard Sniffer's type checking against builds that exclude complex().
# Some minimal/embedded interpreter builds omit complex(); remap the
# name to float so Sniffer.has_header's type-probing list stays valid.
try:
    complex
except NameError:
    complex = float

class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            # Call-form raise (the "raise X, msg" statement is py2-only).
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        # Note: returns the dialect *class*, not an instance.
        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        matches = []
        # Raw strings so the backslashes reach the regex engine verbatim.
        # BUG FIX: the third pattern used to read (?P<delim>>[^\w\n"\'])
        # -- the stray '>' made the delim group two characters wide, so a
        # one-character delimiter before a line-final quoted field could
        # never match.  This matches the corrected pattern shipped with
        # Python 3's csv.Sniffer.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):  # ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            # findall() returns tuples indexed by group number - 1.
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # Pattern had no delim group (the last restr above).
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # Most frequent quote character wins (ties resolved by the
        # reduce order, kept as-is for behavioral compatibility).
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)

        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        # list() so the result is a real list on both Python 2 and 3
        # (len() and slicing are used below).
        data = list(filter(None, data.split('\n')))

        # Renamed from 'ascii' so the ascii() builtin is not shadowed.
        ascii_chars = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii_chars:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                # list() so .remove() and indexing below work on a list
                # on both Python 2 and 3 (dict.items() is a view on 3).
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = list(modes.items())
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims:
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            # list() so the del below never mutates the dict while its
            # keys are being iterated.
            for col in list(columnTypes.keys()):

                # 'long' exists on Python 2 only (this module targets the
                # 2.7 runtime); 'complex' may have been remapped to float
                # by the guard above this class.
                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0