]> git.proxmox.com Git - mirror_edk2.git/blob - AppPkg/Applications/Python/Python-2.7.2/Lib/csv.py
EmbeddedPkg: Extend NvVarStoreFormattedLib LIBRARY_CLASS
[mirror_edk2.git] / AppPkg / Applications / Python / Python-2.7.2 / Lib / csv.py
1
2 """
3 csv.py - read/write/investigate CSV files
4 """
5
6 import re
7 from functools import reduce
8 from _csv import Error, __version__, writer, reader, register_dialect, \
9 unregister_dialect, get_dialect, list_dialects, \
10 field_size_limit, \
11 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
12 __doc__
13 from _csv import Dialect as _Dialect
14
15 try:
16 from cStringIO import StringIO
17 except ImportError:
18 from StringIO import StringIO
19
20 __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
21 "Error", "Dialect", "__doc__", "excel", "excel_tab",
22 "field_size_limit", "reader", "writer",
23 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
24 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
25
class Dialect:
    """Base class describing a CSV dialect.

    Not usable directly: instantiate a subclass (csv.excel, for example)
    that fills in the formatting attributes.  Recognized attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # Placeholder values; concrete subclasses override the ones they need.
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses count as "valid" dialect definitions; the base
        # class itself never validates successfully.
        if self.__class__ is not Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Delegate attribute checking to the C implementation; its
        # TypeError is re-raised as csv.Error for py2.3 compatibility.
        try:
            _Dialect(self)
        except TypeError as e:
            raise Error(str(e))
56
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    # Comma-separated fields, quoted with '"' only when necessary,
    # embedded quotes doubled, CRLF record terminator.
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
# Make this dialect available to reader()/writer() under the name "excel".
register_dialect("excel", excel)
66
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    # Identical to the excel dialect except fields are separated by tabs.
    delimiter = '\t'
# Registered as "excel-tab" (hyphen, not underscore) for reader()/writer().
register_dialect("excel-tab", excel_tab)
71
72
class DictReader:
    """Iterate over CSV rows as dictionaries keyed by the field names.

    When fieldnames is omitted, the first row read from f supplies it.
    Values beyond the field names are gathered in a list under restkey;
    fields missing from a short row are filled in with restval.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily pull the header row off the underlying reader the first
        # time the names are asked for.
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        if self.line_num == 0:
            # Trigger the lazy header read; used only for its side effect.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # Unlike the basic reader, skip blank rows entirely -- returning
        # them would just produce a dict full of restval entries.
        while row == []:
            row = next(self.reader)
        names = self.fieldnames
        result = dict(zip(names, row))
        if len(names) < len(row):
            # Long row: stash the leftover values under restkey.
            result[self.restkey] = row[len(names):]
        elif len(names) > len(row):
            # Short row: pad the missing fields with restval.
            for missing in names[len(row):]:
                result[missing] = self.restval
        return result
121
122
class DictWriter:
    """Write dictionaries out as CSV rows, ordered by fieldnames.

    Keys missing from a row dict are written as restval; keys not listed
    in fieldnames either raise ValueError (extrasaction == "raise") or
    are silently dropped (extrasaction == "ignore").
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        # A header row is simply each field name mapped to itself.
        self.writerow(dict(zip(self.fieldnames, self.fieldnames)))

    def _dict_to_list(self, rowdict):
        # Flatten rowdict into a list ordered by self.fieldnames.
        if self.extrasaction == "raise":
            unexpected = [k for k in rowdict if k not in self.fieldnames]
            if unexpected:
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join(unexpected))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        return self.writer.writerows(
            [self._dict_to_list(rowdict) for rowdict in rowdicts])
155
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # Degrade gracefully: treat would-be complex values as plain floats.
    complex = float
161
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # In case there is more than one possible delimiter, prefer the
        # common separators in this order.
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a Dialect subclass corresponding to the sample.

        delimiters, if given, restricts the candidate field separators.
        Raises Error when no delimiter can be determined.
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            # No quoted fields to go by; fall back to char frequencies.
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns (quotechar, doublequote, delimiter, skipinitialspace).
        """

        matches = []
        # Patterns run from most specific (delimiter on both sides of the
        # quoted field) to least specific (lone quoted field on a line).
        # BUGFIX: the third pattern used to begin (?P<delim>>[^\w\n"\']) --
        # the stray '>' required a literal '>' before the real delimiter,
        # so the trailing ,".*?" case almost never matched.  Removed.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            # findall() returns plain tuples of group values; map named
            # groups back to tuple positions via groupindex (1-based).
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # Pattern without a delim group (lone quoted field).
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # The most frequently seen quote character wins.
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            # If every delimiter occurrence was followed by a space,
            # assume skipinitialspace.
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                   r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                   {'delim':delim, 'quote':quotechar}, re.MULTILINE)

        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.

        Returns (delimiter, skipinitialspace).
        """

        # Drop empty lines (equivalent to filter(None, ...)).
        data = [line for line in data.split('\n') if line]

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                # Materialize so we can remove() the mode below.
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims)[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims:
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        """Return True when the first row of sample looks like a header.

        Creates a dictionary of types of data in each column. If any
        column is of a single type (say, integers), *except* for the first
        row, then the first row is presumed to be labels. If the type
        can't be determined, it is assumed to be a string in which case
        the length of the string is the determining factor: if all of the
        rows except for the first are the same length, it's a header.
        Finally, a 'vote' is taken at the end for each column, adding or
        subtracting from the likelihood of the first row being a header.
        """

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        # Alias long defensively: keeps py2 semantics (long("5L") parses)
        # while surviving builds/versions where long does not exist.
        try:
            _long = long
        except NameError:
            _long = int

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            # list() the keys: we delete from columnTypes inside the loop.
            for col in list(columnTypes.keys()):

                for thisType in [int, _long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == _long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0