large_data.py (5620B)
#!/usr/bin/env python3
# vim: foldmethod=marker
# File       : large_data.py
# Description: Example: using generators for reading (large) data files
# Copyright 2022 Harvard University. All Rights Reserved.
import os
import argparse
from itertools import chain

import numpy as np

# File names used by `main()` for the generated test data.
BINARY_DATA_FILE = 'data.bin'
ASCII_DATA_FILE = 'data.txt'


# argument parser {{{1
def parse_args(*, partial=False):
    """
    Parse the command line arguments of this example.

    Parameters
    ----------
    partial : bool
        If true, unknown arguments are tolerated and the result of
        `ArgumentParser.parse_known_args()` is returned, i.e. a tuple of the
        parsed namespace and the list of remaining argument strings.

    Returns
    -------
    argparse.Namespace or tuple
        Parsed arguments (see `partial` for the tuple form).

    """
    parser = argparse.ArgumentParser(description="Generator example with large data.")
    # yapf: disable
    parser.add_argument('-g', '--generate', action='store_true',
            help="Generate the test data (will take up `size` MB of disk space)")
    parser.add_argument('-e', '--eager', action='store_true',
            help="Include data source obtained with eager reader.")
    parser.add_argument('-s', '--size', type=int, default=2048,
            help="Size of test data file in MB. Default is 2048MB.")
    parser.add_argument('-c', '--chunk_size', type=int, default=1,
            help="Chunk size for lazy load of binary data file.")
    # yapf: enable
    if partial:
        return parser.parse_known_args()
    return parser.parse_args()


# process data {{{1
def process(*data_iterables):
    """
    Process data from multiple sources.

    In this example the data elements are expected to be scalars and are just
    summed up.  In practice here is where you do the actual work.  A progress
    line is printed after every 10000 processed items.

    Parameters
    ----------
    *data_iterables : iterable
        Any number of data sources.  They are consumed one after another.

    Returns
    -------
    scalar
        Sum of all elements of all sources (0 if there are no elements).

    Example:
    --------
    Note that data iterables can be iterables, iterators or generators.  The
    example below uses a list and tuple to demonstrate.

    >>> process([1, 2, 3], (4, 5))
    15

    """
    report_every = 10000
    val = 0
    # `chain.from_iterable` walks the sources lazily, so generators are only
    # advanced one element at a time.
    for item_count, item in enumerate(chain.from_iterable(data_iterables), start=1):
        val += item
        if item_count % report_every == 0:
            print(f'{item_count} items processed from input')
    return val


# data readers (generators) {{{1
# lazy binary data reader (generator) {{{2
def read_binary_lazy(fname, chunk_size=1):
    """
    Lazy binary data loader (generator).

    Load data from a binary file specific to an application.  This loader is
    using a generator to load the data lazily upon request in chunk sizes
    specified by `chunk_size`.

    Parameters
    ----------
    fname : str
        Path to data file.
    chunk_size : int
        Number of data elements to be loaded from the file per read (passed
        as `count` to `np.fromfile`).  This parameter can optimize the
        bandwidth for reading the file.  Reading small sizes from a file is
        not efficient.

    Yields
    ------
    float
        One data element from the data file.

    """
    # The file holds raw floats, hence it is opened in binary mode.
    with open(fname, 'rb') as f:
        # Assignment expressions `:=` Python 3.8 and beyond only
        # (see https://peps.python.org/pep-0572/)
        # An empty chunk signals that the end of the file has been reached.
        while (chunk := np.fromfile(f, dtype=float, count=chunk_size)).size > 0:
            yield from chunk


# eager binary data reader (iterable) {{{2
def read_binary_eager(fname):
    """
    Eager binary data loader (iterable numpy array).

    Load data from a binary file specific to an application.  This loader is
    reading all content in the file eagerly.  It may consume a large amount of
    RAM on your system if the file is large.

    Parameters
    ----------
    fname : str
        Path to data file.

    Returns
    -------
    array_like
        NumPy array of data in file.

    """
    # The file holds raw floats, hence it is opened in binary mode.
    with open(fname, 'rb') as f:
        return np.fromfile(f, dtype=float)


# lazy text data reader (generator) {{{2
def read_ascii_lazy(fname, sep=','):
    """
    Lazy ASCII data loader (generator).

    Alternative ASCII data loader for demonstration purpose.  Assume you are
    working with data sources that deliver the data in either binary form or
    ASCII text.  This additional data reader allows to read the ASCII version
    of the data sources.

    Parameters
    ----------
    fname : str
        Path to data file.
    sep : str
        Separator between the data elements in the text file.

    Yields
    ------
    float
        One data element from the data file.

    """
    with open(fname, 'r') as f:
        # Assignment expressions `:=` Python 3.8 and beyond only
        # (see https://peps.python.org/pep-0572/)
        while (item := np.fromfile(f, count=1, sep=sep)).size > 0:
            # `item` is a length-1 array.  Unwrap it so that a scalar is
            # yielded (as documented and as the binary readers do); yielding
            # the array itself would turn the sum in `process` into an array.
            yield item[0]


# data generation {{{1
def gen_data(nelements, fname, sep=''):
    """
    Generate test data files.

    The random data is created in memory as a whole before it is written.

    Parameters
    ----------
    nelements : int
        Number of random data elements.
    fname : str
        Path to data file.
    sep : str
        Data separator.  Binary file is generated if empty string.

    """
    x = np.random.rand(nelements)
    x.tofile(fname, sep=sep)


# main() {{{1
def main(args):
    """
    Optionally generate the test data, then process it.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed arguments as returned by `parse_args()`.  The attributes
        `generate`, `size`, `eager` and `chunk_size` are used.

    """
    # generate data {{{2
    if args.generate:
        # `size` is given in MB and one float occupies 8 bytes
        n_floats = args.size * 1024 * 1024 // 8
        gen_data(n_floats, BINARY_DATA_FILE)  # random binary data
        gen_data(8, ASCII_DATA_FILE, sep=',')  # random text data

    # process data {{{2
    if os.path.isfile(BINARY_DATA_FILE) and os.path.isfile(ASCII_DATA_FILE):
        sources = []

        # use an eager reader for demonstration, a large file will occupy a lot
        # of RAM
        if args.eager:
            sources.append(read_binary_eager(BINARY_DATA_FILE))

        # performance demonstration is based on the lazy_binary data reader
        sources.append(read_binary_lazy(BINARY_DATA_FILE, args.chunk_size))

        # ASCII reader is just for additional demonstration that you can easily
        # treat different data sources in Python
        sources.append(read_ascii_lazy(ASCII_DATA_FILE))

        # process all these sources
        result = process(*sources)
        print('Processed result:', result)


if __name__ == "__main__":
    args = parse_args()
    main(args)