ihs29x
1# (c) 2023 terminus, LLC 2# Permission is hereby granted, free of charge, to any person obtaining a copy 3# of this software and associated documentation files (the "Software"), to deal 4# in the Software without restriction, including without limitation the rights 5# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 6# copies of the Software, and to permit persons to whom the Software is 7# furnished to do so, subject to the following conditions: 8# 9# The above copyright notice and this permission notice shall be included in all 10# copies or substantial portions of the Software. 11# 12# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 15# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 18# SOFTWARE. 19 20import json 21import warnings 22from datetime import date, datetime 23 24import importlib_resources as ir 25 26from typing import ( 27 Any, Callable, Iterator, NamedTuple, TextIO, TypeAlias, TypedDict 28) 29 30 31__version__ = '0.1.0' 32 33 34SPEC297 = json.loads( 35 ir.files('ihs29x').joinpath('TwoNinetySeven.json').read_bytes()) 36SPEC298 = json.loads( 37 ir.files('ihs29x').joinpath('TwoNinetyEight.json').read_bytes()) 38 39 40class FieldSpec(TypedDict): 41 description: str 42 type: str 43 position: int 44 length: int 45 46 47class RecordSpec(TypedDict): 48 indicator: str 49 description: str 50 fields: list[FieldSpec] 51 52 53'''A latitude in degrees, minutes, seconds format. 54 55This is equivalent to the "G2" type in the IHS format spec. 
56''' 57class LatitudeDMS(NamedTuple): 58 degrees: int 59 minutes: int 60 seconds: int 61 62 63'''A longitude in degrees, minutes, seconds format. 64 65This is equivalent to the "G3" type in the IHS format spec. 66''' 67class LongitudeDMS(NamedTuple): 68 degrees: int 69 minutes: int 70 seconds: int 71 72 73'''A single record (i.e. line) from an IHS 29x file.''' 74class Record(NamedTuple): 75 indicator: str 76 '''The record type indicator field's value. 77 78 In other words, the first few characters of the line, which indicate what 79 type of record the line represents. 80 ''' 81 82 type: str 83 '''The record type, as described by the IHS spec.''' 84 85 contents: dict[ 86 str, 87 str | float | int | datetime | LatitudeDMS | LongitudeDMS | None 88 ] 89 '''The record's fields, converted to appropriate types.''' 90 91 92class IndicatorTrie(NamedTuple): 93 character: str 94 record: list[RecordSpec] # used as mutable cell for 0 or 1 records 95 suffixes: dict[str, 'IndicatorTrie'] 96 97 def insert(self, indicator: str, record: RecordSpec) -> None: 98 if indicator == '': 99 self.record.clear() 100 self.record.append(record) 101 return 102 next_char = indicator[0] 103 if next_char not in self.suffixes: 104 self.suffixes[next_char] = IndicatorTrie(next_char, [], {}) 105 self.suffixes[next_char].insert(indicator[1:], record) 106 107 108 def longest_match(self, indicator: str) -> RecordSpec: 109 # NASTY HACK for .9xc formats, where the indicator is sometimes quoted 110 while indicator and indicator[0] == '"': 111 indicator = indicator[1:] 112 if indicator and indicator[0] in self.suffixes: 113 return self.suffixes[indicator[0]].longest_match(indicator[1:]) 114 if self.record: 115 return self.record[0] 116 raise KeyError(indicator) 117 118 119FieldParser: TypeAlias = Callable[[list[FieldSpec], str, bool], dict[str, Any]] 120 121 122def spec_to_indicator_trie(spec: list[RecordSpec]) -> IndicatorTrie: 123 t = IndicatorTrie('', [], {}) 124 for r in spec: 125 
t.insert(r['indicator'], r) 126 return t 127 128 129SPEC297_TRIE = spec_to_indicator_trie(SPEC297['records']) 130SPEC298_TRIE = spec_to_indicator_trie(SPEC298['records']) 131 132 133def convert_field(ty: str, val: str) -> Any: 134 match ty: 135 case 'Alphanumeric': 136 return val 137 case 'Numeric': 138 return float(val) if val != '' else None 139 case 'YearYYYY': 140 return int(val) if val != '' else None 141 case 'MonthYYYYMM': 142 return datetime.strptime(val, '%Y%m') if val != '' else None 143 case 'DateYYYYSlashMMSlashDD': 144 return datetime.strptime(val, '%Y/%m/%d') if val != '' else None 145 case 'DateYYYYMMDD': 146 return datetime.strptime(val, '%Y%m%d') if val != '' else None 147 case 'G2': 148 return ( 149 LatitudeDMS(*[int(part) for part in val.split('.')]) 150 if val != '' 151 else None 152 ) 153 case 'G3': 154 return ( 155 LongitudeDMS(*[int(part) for part in val.split('.')]) 156 if val != '' 157 else None 158 ) 159 case _: 160 if val != ty: 161 raise ValueError(f'expected literal "{ty}", found "{val}"') 162 return val 163 164 165def comma_split(line: str) -> list[str]: 166 field: list[str] = [] 167 fields = [] 168 in_quote = False 169 quote_in_quote = False 170 for c in line: 171 if not in_quote: 172 if c == ',': 173 fields.append(''.join(field)) 174 field = [] 175 elif c == '"': 176 in_quote = True 177 else: 178 field.append(c) 179 elif not quote_in_quote: 180 if c == '"': 181 quote_in_quote = True 182 else: 183 field.append(c) 184 else: 185 if c == '"': 186 field.append(c) # escaped quote 187 else: 188 in_quote = False 189 if c == ',': 190 fields.append(''.join(field)) 191 field = [] 192 else: # weird case... 
193 field.append(c) 194 quote_in_quote = False 195 if line: 196 fields.append(''.join(field)) 197 return fields 198 199 200def parse_fields_fixed(fields: list[FieldSpec], line: str, strict: bool = False 201 ) -> dict[str, Any]: 202 result = {} 203 for f in fields: 204 ix = f['position'] - 1 205 ty = f['type'] 206 val = line[ix:(ix + f['length'])].rstrip() 207 if strict: 208 conv = convert_field(ty, val) 209 else: 210 try: 211 conv = convert_field(ty, val) 212 except ValueError: 213 warnings.warn(f'failed to convert "{val}" to type "{ty}"') 214 conv = val 215 result[f['description']] = conv 216 return result 217 218 219def parse_fields_comma(fields: list[FieldSpec], line: str, strict: bool = False 220 ) -> dict[str, Any]: 221 result = {} 222 vals = [w.rstrip() for w in comma_split(line)] 223 if len(vals) > len(fields): 224 raise ValueError('Invalid row length') 225 while len(vals) < len(fields): 226 # NASTY HACK for .9xc format: right-pad with blanks to match "Blank" 227 # fields in spec 228 vals.append('') 229 for f, val in zip(fields, vals): 230 ty = f['type'] 231 if strict: 232 conv = convert_field(ty, val) 233 else: 234 try: 235 conv = convert_field(ty, val) 236 except ValueError: 237 warnings.warn(f'failed to convert "{val}" to type "{ty}"') 238 conv = val 239 result[f['description']] = conv 240 return result 241 242 243def parse_header(line: str) -> tuple[IndicatorTrie, FieldParser, dict[str, Any]]: 244 if 'US WELL DATA' in line: 245 hdr_fields = SPEC297['header'] 246 trie = SPEC297_TRIE 247 fmt = '297' 248 elif 'US PRODUCTION DATA' in line: 249 hdr_fields = SPEC298['header'] 250 trie = SPEC298_TRIE 251 fmt = '298' 252 else: 253 raise ValueError(f'invalid header line: "{line}"') 254 255 hdr = parse_fields_fixed(hdr_fields, line) 256 257 if hdr['Download Format'] != fmt: 258 raise ValueError('unexpected download format') 259 if hdr['Version'] != '1.1': 260 raise ValueError('unexpected format version') 261 262 if hdr['Delimiter'] == 'FIXED': 263 field_parser = 
parse_fields_fixed 264 elif hdr['Delimiter'] == 'COMMA': 265 field_parser = parse_fields_comma 266 else: 267 raise ValueError('expected FIXED or COMMA format') 268 269 return trie, field_parser, hdr 270 271 272def parse_record_spec(line: str, trie: IndicatorTrie) -> RecordSpec: 273 try: 274 return trie.longest_match(line) 275 except KeyError: 276 raise ValueError(f'Unknown record type for {line}') 277 278 279def parse_record(line: str, spec: RecordSpec, field_parser: FieldParser, 280 strict: bool = False) -> Record: 281 return Record(spec['indicator'], spec['description'], 282 field_parser(spec['fields'], line, strict)) 283 284 285def stream_records(src: TextIO, strict: bool = False, 286 indicators: set[str] | None = None) -> Iterator[Record]: 287 '''Stream records from an IHS 29x file. 288 289 File type (297/well header or 298/well production) and format (fixed or 290 comma-delimited) will be automatically determined. 291 292 In accordance with the spec, files may contain multiple types and formats, 293 each with their own header. 294 Headers will be streamed as `Record` objects whose `indicator` is either 295 `'US WELL DATA'` or `'US PRODUCTION DATA'` (for 297 or 298 formats 296 respectively) and whose `type` is `'File Header'`. 297 298 Parameters: 299 - `src`: a file or file-like object open in text mode 300 301 - `strict` (default `False`): if `True`, raise a `ValueError` when 302 encountering a field which cannot be converted to the appropriate type 303 as indicated by the spec; if `False`, these fields will be returned as 304 `str` values, but a warning will be issued for each one encountered 305 306 - `indicators` (default `None`): if provided, a set of indicator field 307 values (e.g. `'A'` for "General Information" records from a 297 file) 308 corresponding to rows which should be included in the output (rows 309 with indicators not in this set will be skipped, except that per-well 310 or per-entity start and end records [e.g. 
`'START_US_WELL'`] are 311 always included); if `None`, all rows will be processed 312 313 Returns an iterator over (selected) records from the file. 314 ''' 315 316 if indicators is not None: 317 indicators |= { 318 'START_US_WELL', 'END_US_WELL', 'START_US_PROD', 'END_US_PROD' 319 } 320 321 seen_start = False 322 count = 0 323 while line := src.readline(): 324 trie, field_parser, hdr = parse_header(line) 325 count = int(hdr['Entity Count']) 326 yield Record(hdr['Data Type'], 'File Header', hdr) 327 328 while line := src.readline(): 329 rec_spec = parse_record_spec(line, trie) 330 if indicators is not None and rec_spec['indicator'] not in indicators: 331 continue 332 rec = parse_record(line, rec_spec, field_parser, strict) 333 if rec.type == 'Start Record Label': 334 if seen_start: 335 raise ValueError('start without end') 336 else: 337 seen_start = True 338 if rec.type == 'End Record Label': 339 if seen_start: 340 seen_start = False 341 count -= 1 342 if count == 0: 343 yield rec 344 break # try parsing as header 345 else: 346 raise ValueError('end without start') 347 yield rec 348 349 if count > 0: 350 warnings.warn(f'{count} records still expected at end of file') 351 352__all__ = [ 353 'LatitudeDMS', 354 'LongitudeDMS', 355 'Record', 356 'stream_records', 357]
LatitudeDMS(degrees, minutes, seconds)
Create new instance of LatitudeDMS(degrees, minutes, seconds)
Inherited Members
- builtins.tuple
- index
- count
LongitudeDMS(degrees, minutes, seconds)
Create new instance of LongitudeDMS(degrees, minutes, seconds)
Inherited Members
- builtins.tuple
- index
- count
75class Record(NamedTuple): 76 indicator: str 77 '''The record type indicator field's value. 78 79 In other words, the first few characters of the line, which indicate what 80 type of record the line represents. 81 ''' 82 83 type: str 84 '''The record type, as described by the IHS spec.''' 85 86 contents: dict[ 87 str, 88 str | float | int | datetime | LatitudeDMS | LongitudeDMS | None 89 ] 90 '''The record's fields, converted to appropriate types.'''
Record(indicator, type, contents)
Create new instance of Record(indicator, type, contents)
The record type indicator field's value.
In other words, the first few characters of the line, which indicate what type of record the line represents.
The record's fields, converted to appropriate types.
Inherited Members
- builtins.tuple
- index
- count
286def stream_records(src: TextIO, strict: bool = False, 287 indicators: set[str] | None = None) -> Iterator[Record]: 288 '''Stream records from an IHS 29x file. 289 290 File type (297/well header or 298/well production) and format (fixed or 291 comma-delimited) will be automatically determined. 292 293 In accordance with the spec, files may contain multiple types and formats, 294 each with their own header. 295 Headers will be streamed as `Record` objects whose `indicator` is either 296 `'US WELL DATA'` or `'US PRODUCTION DATA'` (for 297 or 298 formats 297 respectively) and whose `type` is `'File Header'`. 298 299 Parameters: 300 - `src`: a file or file-like object open in text mode 301 302 - `strict` (default `False`): if `True`, raise a `ValueError` when 303 encountering a field which cannot be converted to the appropriate type 304 as indicated by the spec; if `False`, these fields will be returned as 305 `str` values, but a warning will be issued for each one encountered 306 307 - `indicators` (default `None`): if provided, a set of indicator field 308 values (e.g. `'A'` for "General Information" records from a 297 file) 309 corresponding to rows which should be included in the output (rows 310 with indicators not in this set will be skipped, except that per-well 311 or per-entity start and end records [e.g. `'START_US_WELL'`] are 312 always included); if `None`, all rows will be processed 313 314 Returns an iterator over (selected) records from the file. 
315 ''' 316 317 if indicators is not None: 318 indicators |= { 319 'START_US_WELL', 'END_US_WELL', 'START_US_PROD', 'END_US_PROD' 320 } 321 322 seen_start = False 323 count = 0 324 while line := src.readline(): 325 trie, field_parser, hdr = parse_header(line) 326 count = int(hdr['Entity Count']) 327 yield Record(hdr['Data Type'], 'File Header', hdr) 328 329 while line := src.readline(): 330 rec_spec = parse_record_spec(line, trie) 331 if indicators is not None and rec_spec['indicator'] not in indicators: 332 continue 333 rec = parse_record(line, rec_spec, field_parser, strict) 334 if rec.type == 'Start Record Label': 335 if seen_start: 336 raise ValueError('start without end') 337 else: 338 seen_start = True 339 if rec.type == 'End Record Label': 340 if seen_start: 341 seen_start = False 342 count -= 1 343 if count == 0: 344 yield rec 345 break # try parsing as header 346 else: 347 raise ValueError('end without start') 348 yield rec 349 350 if count > 0: 351 warnings.warn(f'{count} records still expected at end of file')
Stream records from an IHS 29x file.
File type (297/well header or 298/well production) and format (fixed or comma-delimited) will be automatically determined.
In accordance with the spec, files may contain multiple types and formats,
each with their own header.
Headers will be streamed as `Record` objects whose `indicator` is either
`'US WELL DATA'` or `'US PRODUCTION DATA'` (for 297 or 298 formats
respectively) and whose `type` is `'File Header'`.
Parameters:
- `src`: a file or file-like object open in text mode
- `strict` (default `False`): if `True`, raise a `ValueError` when
  encountering a field which cannot be converted to the appropriate type
  as indicated by the spec; if `False`, these fields will be returned as
  `str` values, but a warning will be issued for each one encountered
- `indicators` (default `None`): if provided, a set of indicator field
  values (e.g. `'A'` for "General Information" records from a 297 file)
  corresponding to rows which should be included in the output (rows with
  indicators not in this set will be skipped, except that per-well or
  per-entity start and end records [e.g. `'START_US_WELL'`] are always
  included); if `None`, all rows will be processed
Returns an iterator over (selected) records from the file.