* Simple text input reader. Imported from pyqmc.utils.text_input .

2010-09-27 19:54:05 +00:00
parent 5f367c8fc8
commit cae9cabdea
1 changed files with 211 additions and 0 deletions
--- a/iofmt/text_input.py
+++ b/iofmt/text_input.py
@@ -0,0 +1,211 @@
+#!/usr/bin/python
+# $Id: text_input.py,v 1.1 2010-09-27 19:54:05 wirawan Exp $
+#
+# wpylib.iofmt.text_input module
+# Quick-n-dirty text input utilities
+#
+# Wirawan Purwanto
+# Created: 20090601
+#
+# Routines put here are commonly used in my own scripts.
+# They are not necessarily suitable for general-purpose uses; evaluate
+# your needs and see if they can meet them as well.
+#
+# 20090601: Created as pyqmc.utils.text_input .
+# 20100927: Moved to wpylib.iofmt.text_input .
+#
+# TODO
+# - book-keep the line number. Also note superfile must have its own line
+#   number keeping.
+#
+"""
+Simple text-based input reader.
+
+This module is part of wpylib project.
+"""
+
+import re
+import numpy
+
+from wpylib.file.file_utils import open_input_file
+
+
+class text_input(object):
+  '''Text input reader with support for UNIX-style comment marker (#) and
+  standard field separation (tabs and whitespaces).
+  Used for quick and dirty data reading (iterating only once in forward
+  direction without the need of rewinding or skipping).
+  This object can be treated like an input file, e.g. used as an iterator,
+  etc.  Iterating yields comment-stripped lines (see next_line).
+
+  To support more fancy options (e.g., rewinding), use "superize=1" when
+  creating the instance.  Other keyword options go to set_options.'''
+
+  def __init__(self, fname, **opts):
+    # "superize" is consumed here even when it is false, so that it never
+    # reaches set_options (which would reject it as an invalid option).
+    open_opts = {}
+    superize = opts.pop("superize", 0)
+    if superize:
+      open_opts["superize"] = superize
+    self.file = open_input_file(fname, **open_opts)
+    # field_filtering_proc field can be used to filter unwanted fields, or do
+    # some additional transformations before final feed to the main iteration.
+    self.field_filtering_proc = lambda flds : flds
+    # Default fancy options:
+    self.skip_blank_lines = True
+    if opts:
+      self.set_options(**opts)
+
+  def __del__(self):
+    # self.file does not exist if open_input_file raised inside __init__.
+    if getattr(self, "file", None) is not None:
+      self.file.close()
+
+  def __iter__(self):
+    return self
+
+  def next_rec(self):
+    '''Returns the next record (the fields of the next line, comment removed,
+    passed through field_filtering_proc), skipping blank ones by default.'''
+    while True:
+      L = self.file.next()
+      F = self.field_filtering_proc(L.split("#")[0].split())
+      if len(F) > 0 or not self.skip_blank_lines:
+        return F
+
+  def next_line(self):
+    '''Returns the next line (comment and trailing whitespace removed, passed
+    through field_filtering_proc), skipping blank ones by default.'''
+    while True:
+      L = self.file.next()
+      F = self.field_filtering_proc(L.split("#")[0].rstrip())
+      if len(F) > 0 or not self.skip_blank_lines:
+        return F
+
+  # Do NOT touch the "next" field below unless you know what you're doing:
+  next = next_line
+
+  def _iter_lines(self, maxcount=None):
+    '''Generates at most `maxcount' of the upcoming lines (all of them if
+    maxcount is None) without reading anything past that limit.'''
+    count = 0
+    while maxcount is None or count < maxcount:
+      try:
+        line = self.next()
+      except StopIteration:
+        return
+      yield line
+      count += 1
+
+  def seek_text(self, regex=None, match=None):
+    '''Seeks the file until a particular piece of text is encountered and
+    returns the matching line.  We ignore all comments.
+    The `regex' argument can be either a regex string or a standard python
+    regular expression object; without it, `match' must be a callable that
+    returns a true value for the wanted line (ValueError if both are missing).'''
+    if regex:
+      if hasattr(regex, "search"):
+        match_proc = regex.search
+      else:
+        match_proc = re.compile(regex).search
+    elif match is not None:
+      match_proc = match
+    else:
+      raise ValueError("seek_text needs either regex or match")
+
+    while True:
+      L = self.next_line()
+      if match_proc(L):
+        return L
+
+  def read_floats(self, *cols, **kwd):
+    """Quickly reads a set of floats from a text file.
+    Returns a numpy array of the values in double precision.
+
+    Example usage:
+      >>> arr = text_input("/tmp/file.txt").read_floats(0, 2, 3)
+    to read columns 1, 3, and 4 of the text file /tmp/file.txt, while disregarding
+    comments.
+
+    The maxcount keyword option limits the number of records read.
+    """
+    # float_fields extracts the desired columns and converts them to floats
+    float_fields = lambda vals : [ float(vals[col]) for col in cols ]
+    lines = self._iter_lines(kwd.get("maxcount"))
+    # finally convert them to a numpy ndarray:
+    return numpy.array([ float_fields(vals.split()) for vals in lines ])
+
+  def read_items(self, *col_desc, **kwd):
+    """Quickly reads a set of items from records of whitespace-separated fields
+    in a text file.
+    Returns a structured numpy array of the values read.
+
+    Example usage:
+
+      >>> arr = text_input("/tmp/file.txt").read_items(0, (2, int), (3, "S10", "Atom"))
+
+    reads columns 1 (as floats, by default), 3 (as integers), and 4 (as strings of
+    max length of 10, which field is named "Atom") from the text file /tmp/file.txt,
+    while disregarding comments.
+
+    If the tuple contains the third field, it is used as the name of the field;
+    otherwise the fields are named f0, f1, f2, ....
+    Any other kind of column descriptor raises ValueError.
+
+    Additional keyword options:
+    * deftype: default datatype
+    * maxcount: maximum number of records to be read
+
+    TODO: Needs ability to read in complex data.
+    """
+    deftype = kwd.get("deftype", float)
+
+    # Translate col_desc into column indices (cols) and numpy dtype (flds):
+    flds = []
+    cols = []
+    for (i,c) in enumerate(col_desc):
+      if isinstance(c, int):
+        c = (c,)
+      if not (1 <= len(c) <= 3):
+        raise ValueError("Invalid column descriptor: %s" % (c,))
+      (name, dtype) = ('f' + str(i), deftype)
+      if len(c) >= 2:
+        dtype = c[1]
+      if len(c) == 3:
+        name = c[2]
+      cols.append(c[0])
+      flds.append((name, dtype))
+
+    get_fields = lambda vals : tuple([ vals[col] for col in cols ])
+    lines = self._iter_lines(kwd.get("maxcount"))
+    # finally convert them to a structured numpy ndarray:
+    return numpy.array([ get_fields(vals.split()) for vals in lines ], dtype=flds)
+
+  def set_options(self, **opts):
+    '''Sets the fancy options (expand_errorbar, skip_blank_lines) given as
+    keywords; raises ValueError for any other name.  Returns self.'''
+    for (o,v) in opts.items():
+      if o == "expand_errorbar":
+        self.expand_errorbar(v)
+      elif o == "skip_blank_lines":
+        self.skip_blank_lines = v
+      else:
+        raise ValueError("Invalid option: %s" % (o,))
+    return self
+
+  def expand_errorbar(self, v=True):
+    '''Enables or disables errorbar expansion.'''
+    if v:
+      self.opt_expand_errorbar = True
+      self.field_filtering_proc = self.expand_errorbar_hook
+    else:
+      self.opt_expand_errorbar = False
+      self.field_filtering_proc = lambda flds : flds
+    return self
+
+  def expand_errorbar_hook(self, F):
+    # A hook for field_filtering_proc for expanding errorbars:
+    # NOTE(review): pyqmc import predates the move to wpylib -- confirm it.
+    from pyqmc.stats.errorbar import expand
+    return expand(F, flatten=True)