large_data.py - cs107-lecture-examples - Example codes used during Harvard CS107 lectures

large_data.py (5620B)
      1 #!/usr/bin/env python3
      2 # vim: foldmethod=marker
      3 # File       : large_data.py
      4 # Description: Example: using generators for reading (large) data files
      5 # Copyright 2022 Harvard University. All Rights Reserved.
      6 import os
      7 import argparse
      8 from itertools import chain
      9 
     10 import numpy as np
     11 
     12 
     13 # argument parser {{{1
     14 def parse_args(*, partial=False):
     15     parser = argparse.ArgumentParser(description="Generator example with large data.")
     16     # yapf: disable
     17     parser.add_argument('-g', '--generate', action='store_true',
     18             help="Generate the test data (will take up `size` MB of disk space)")
     19     parser.add_argument('-e', '--eager', action='store_true',
     20             help="Include data source obtained with eager reader.")
     21     parser.add_argument('-s', '--size', type=int, default=2048,
     22             help="Size of test data file in MB.  Default is 2048MB.")
     23     parser.add_argument('-c', '--chunk_size', type=int, default=1,
     24             help="Chunk size for lazy load of binary data file.")
     25     # yapf: enable
     26     if partial:
     27         return parser.parse_known_args()
     28     else:
     29         return parser.parse_args()
     30 
     31 
     32 # process data {{{1
     33 def process(*data_iterables):
     34     """
     35     Process data from multiple sources.
     36 
     37     In this example the data elements are expected to be scalars and are just
     38     summed up.  In practice here is where you do the actual work.
     39 
     40     Example:
     41     --------
     42     Note that data iterables can be iterables, iterators or generators.  The
     43     example below uses a list and tuple to demonstrate.
     44 
     45     >>> process([1, 2, 3], (4, 5))
     46     15
     47 
     48     """
     49     val = 0
     50     item_count = 0
     51     for item in chain.from_iterable(data_iterables):
     52         val += item
     53         item_count += 1
     54         if item_count % 10000 == 0:
     55             print(f'{item_count} items processed from input')
     56     return val
     57 
     58 
     59 # data readers (generators) {{{1
     60 # lazy binary data reader (generator) {{{2
     61 def read_binary_lazy(fname, chunk_size=1):
     62     """
     63     Lazy binary data loader (generator).
     64 
     65     Load data from a binary file specific to an application.  This loader is
     66     using a generator to load the data lazy upon request in chunk sizes
     67     specified by `chunk_size`.
     68 
     69     Parameters
     70     ----------
     71     fname : str
     72         Path to data file.
     73     chunk_size : int
     74         Number of data elements to be loaded from the file.  This parameter can
     75         optimize the bandwidth for reading the file.  Reading small sizes from a
     76         file is not efficient.
     77 
     78     Yields
     79     ------
     80     float
     81         One data element from the data file.
     82 
     83     """
     84 
     85     with open(fname, 'r') as f:
     86         # Assignment expressions `:=` Python 3.8 and beyond only
     87         # (see https://peps.python.org/pep-0572/)
     88         while (chunk := np.fromfile(f, dtype=float, count=chunk_size)).size > 0:
     89             for item in chunk:
     90                 yield item
     91 
     92 
     93 # eager binary data reader (iterable) {{{2
     94 def read_binary_eager(fname):
     95     """
     96     Eager binary data loader (iterable numpy array).
     97 
     98     Load data from a binary file specific to an application.  This loader is
     99     reading all content in the file eagerly.  It may consume a large amount of
    100     RAM on your system if the file is large.
    101 
    102     Parameters
    103     ----------
    104     fname : str
    105         Path to data file.
    106 
    107     Returns
    108     -------
    109     array_like
    110         NumPy array of data in file.
    111 
    112     """
    113     with open(fname, 'r') as f:
    114         return np.fromfile(f, dtype=float)
    115 
    116 
    117 # lazy text data reader (generator) {{{2
    118 def read_ascii_lazy(fname, sep=','):
    119     """
    120     Lazy ASCII data loader (generator).
    121 
    122     Alternative ASCII data loader for demonstration purpose.  Assume you are
    123     working with data sources that deliver the data in either binary form or
    124     ASCII text.  This additional data reader allows to read the ASCII version of
    125     the data sources.
    126 
    127     Parameters
    128     ----------
    129     fname : str
    130         Path to data file.
    131 
    132     Yields
    133     ------
    134     float
    135         One data element from the data file.
    136     """
    137     with open(fname, 'r') as f:
    138         # Assignment expressions `:=` Python 3.8 and beyond only
    139         # (see https://peps.python.org/pep-0572/)
    140         while (item := np.fromfile(f, count=1, sep=sep)).size > 0:
    141             yield item
    142 
    143 
    144 # data generation {{{1
    145 def gen_data(nelements, fname, sep=''):
    146     """
    147     Generate test data files.
    148 
    149     Parameters
    150     ----------
    151     nelements : int
    152         Number of random data elements.
    153     fname : str
    154         Path to data file.
    155     sep : str
    156         Data separator.  Binary file is generated if empty string.
    157 
    158     """
    159     x = np.random.rand(nelements)
    160     x.tofile(fname, sep=sep)
    161 
    162 
    163 # main() {{{1
    164 def main(args):
    165     # generate data {{{2
    166     if args.generate:
    167         n_floats = args.size * 1024 * 1024 // 8
    168         gen_data(n_floats, 'data.bin')  # random binary data
    169         gen_data(8, 'data.txt', sep=',')  # random text data
    170 
    171     # process data {{{2
    172     if os.path.isfile('data.bin') and os.path.isfile('data.txt'):
    173         sources = []
    174 
    175         # use an eager reader for demonstration, a large file will occupy a lot
    176         # of RAM
    177         if args.eager:
    178             sources.append(read_binary_eager('data.bin'))
    179 
    180         # performance demonstration is based on the lazy_binary data reader
    181         sources.append(read_binary_lazy('data.bin', args.chunk_size))
    182 
    183         # ASCII reader is just for additional demonstration that you can easily
    184         # treat different data sources in Python
    185         sources.append(read_ascii_lazy('data.txt'))
    186 
    187         # process all these sources
    188         result = process(*sources)
    189         print('Processed result:', result)
    190 
    191 
    192 if __name__ == "__main__":
    193     args = parse_args()
    194     main(args)
	cs107-lecture-examples Example codes used during Harvard CS107 lectures
	git clone https://git.0xfab.ch/cs107-lecture-examples.git
	Log \| Files \| Refs \| README \| LICENSE