ihs29x

  1# (c) 2023 terminus, LLC
  2# Permission is hereby granted, free of charge, to any person obtaining a copy
  3# of this software and associated documentation files (the "Software"), to deal
  4# in the Software without restriction, including without limitation the rights
  5# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  6# copies of the Software, and to permit persons to whom the Software is
  7# furnished to do so, subject to the following conditions:
  8# 
  9# The above copyright notice and this permission notice shall be included in all
 10# copies or substantial portions of the Software.
 11# 
 12# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 13# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 14# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 15# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 16# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 17# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 18# SOFTWARE.
 19
 20import json
 21import warnings
 22from datetime import date, datetime
 23
 24import importlib_resources as ir
 25
 26from typing import (
 27    Any, Callable, Iterator, NamedTuple, TextIO, TypeAlias, TypedDict
 28)
 29
 30
__version__ = '0.1.0'


# Format specs, loaded once at import time from JSON files bundled with the
# `ihs29x` package. Elsewhere in this module each spec is indexed by 'header'
# (the header line's field layout) and 'records' (the record layouts).
SPEC297 = json.loads(
  ir.files('ihs29x').joinpath('TwoNinetySeven.json').read_bytes())
SPEC298 = json.loads(
  ir.files('ihs29x').joinpath('TwoNinetyEight.json').read_bytes())
 38
 39
class FieldSpec(TypedDict):
    '''Layout of a single field within a record, as given by a format spec.'''

    # Name of the field; parsed values are stored under this key.
    description: str
    # Name of the conversion to apply (e.g. 'Numeric'); any other value is
    # treated as a literal that the field's text must equal.
    type: str
    # 1-based column at which the field starts in a fixed-width line.
    position: int
    # Width of the field, in characters, in a fixed-width line.
    length: int
 45
 46
class RecordSpec(TypedDict):
    '''Layout of one record (line) type, as given by a format spec.'''

    # Leading characters of a line that identify this record type.
    indicator: str
    # Human-readable record type; becomes `Record.type` for parsed lines.
    description: str
    # Layouts of the record's fields, in the order they appear on the line.
    fields: list[FieldSpec]
 51
 52
 53'''A latitude in degrees, minutes, seconds format.
 54
 55This is equivalent to the "G2" type in the IHS format spec.
 56'''
 57class LatitudeDMS(NamedTuple):
 58    degrees: int
 59    minutes: int
 60    seconds: int
 61
 62
 63'''A longitude in degrees, minutes, seconds format.
 64
 65This is equivalent to the "G3" type in the IHS format spec.
 66'''
 67class LongitudeDMS(NamedTuple):
 68    degrees: int
 69    minutes: int
 70    seconds: int
 71
 72
 73'''A single record (i.e. line) from an IHS 29x file.'''
 74class Record(NamedTuple):
 75    indicator: str
 76    '''The record type indicator field's value.
 77
 78    In other words, the first few characters of the line, which indicate what
 79    type of record the line represents.
 80    '''
 81
 82    type: str
 83    '''The record type, as described by the IHS spec.'''
 84
 85    contents: dict[
 86      str,
 87      str | float | int | datetime | LatitudeDMS | LongitudeDMS | None
 88    ]
 89    '''The record's fields, converted to appropriate types.'''
 90
 91
 92class IndicatorTrie(NamedTuple):
 93    character: str
 94    record: list[RecordSpec] # used as mutable cell for 0 or 1 records
 95    suffixes: dict[str, 'IndicatorTrie']
 96
 97    def insert(self, indicator: str, record: RecordSpec) -> None:
 98        if indicator == '':
 99            self.record.clear()
100            self.record.append(record)
101            return
102        next_char = indicator[0]
103        if next_char not in self.suffixes:
104            self.suffixes[next_char] = IndicatorTrie(next_char, [], {})
105        self.suffixes[next_char].insert(indicator[1:], record)
106
107
108    def longest_match(self, indicator: str) -> RecordSpec:
109        # NASTY HACK for .9xc formats, where the indicator is sometimes quoted
110        while indicator and indicator[0] == '"':
111            indicator = indicator[1:]
112        if indicator and indicator[0] in self.suffixes:
113            return self.suffixes[indicator[0]].longest_match(indicator[1:])
114        if self.record:
115            return self.record[0]
116        raise KeyError(indicator)
117
118
# Signature shared by the field parsers in this module: called as
# (field specs, line, strict), returning parsed values keyed by description.
FieldParser: TypeAlias = Callable[[list[FieldSpec], str, bool], dict[str, Any]]
120
121
def spec_to_indicator_trie(spec: list[RecordSpec]) -> IndicatorTrie:
    '''Build a trie holding every record spec in `spec`, keyed by indicator.'''
    root = IndicatorTrie('', [], {})
    for record_spec in spec:
        indicator = record_spec['indicator']
        root.insert(indicator, record_spec)
    return root
127
128
# Indicator lookup tries for each format's record specs, built once at import.
SPEC297_TRIE = spec_to_indicator_trie(SPEC297['records'])
SPEC298_TRIE = spec_to_indicator_trie(SPEC298['records'])
131
132
133def convert_field(ty: str, val: str) -> Any:
134    match ty:
135        case 'Alphanumeric':
136            return val
137        case 'Numeric':
138            return float(val) if val != '' else None
139        case 'YearYYYY':
140            return int(val) if val != '' else None
141        case 'MonthYYYYMM':
142            return datetime.strptime(val, '%Y%m') if val != '' else None
143        case 'DateYYYYSlashMMSlashDD':
144            return datetime.strptime(val, '%Y/%m/%d') if val != '' else None
145        case 'DateYYYYMMDD':
146            return datetime.strptime(val, '%Y%m%d') if val != '' else None
147        case 'G2':
148            return (
149                LatitudeDMS(*[int(part) for part in val.split('.')])
150                if val != ''
151                else None
152            )
153        case 'G3':
154            return (
155                LongitudeDMS(*[int(part) for part in val.split('.')])
156                if val != ''
157                else None
158            )
159        case _:
160            if val != ty:
161                raise ValueError(f'expected literal "{ty}", found "{val}"')
162            return val
163
164
def comma_split(line: str) -> list[str]:
    '''Split a comma-delimited line into fields, honouring double quotes.

    Commas inside a quoted section do not separate fields, and a doubled
    quote inside a quoted section stands for one literal quote character.
    The quote characters delimiting a section are dropped. An empty line
    yields no fields.
    '''
    out = []
    buf: list[str] = []
    quoted = False         # currently inside a quoted section
    pending_quote = False  # previous character was a '"' seen while quoted
    for ch in line:
        if pending_quote:
            pending_quote = False
            if ch == '"':
                buf.append(ch) # escaped quote
                continue
            # the earlier quote closed the section; handle `ch` as unquoted
            quoted = False
        if quoted:
            if ch == '"':
                pending_quote = True
            else:
                buf.append(ch)
        elif ch == ',':
            out.append(''.join(buf))
            buf = []
        elif ch == '"':
            quoted = True
        else:
            buf.append(ch)
    if line:
        out.append(''.join(buf))
    return out
198
199
def parse_fields_fixed(fields: list[FieldSpec], line: str, strict: bool = False
  ) -> dict[str, Any]:
    '''Parse a fixed-width `line` into values keyed by field description.

    Each field's text is sliced out using its 1-based `position` and its
    `length`, stripped of trailing whitespace, and then converted. When a
    conversion fails, `strict` mode lets the `ValueError` propagate; otherwise
    a warning is issued and the unconverted text is kept.
    '''
    parsed = {}
    for spec in fields:
        start = spec['position'] - 1
        kind = spec['type']
        raw = line[start:start + spec['length']].rstrip()
        try:
            value = convert_field(kind, raw)
        except ValueError:
            if strict:
                raise
            warnings.warn(f'failed to convert "{raw}" to type "{kind}"')
            value = raw
        parsed[spec['description']] = value
    return parsed
217
218
def parse_fields_comma(fields: list[FieldSpec], line: str, strict: bool = False
  ) -> dict[str, Any]:
    '''Parse a comma-delimited `line` into values keyed by field description.

    The line is split into fields, each stripped of trailing whitespace, and
    paired with the field specs in order. When a conversion fails, `strict`
    mode lets the `ValueError` propagate; otherwise a warning is issued and
    the unconverted text is kept.

    Raises `ValueError` if the line has more fields than the spec.
    '''
    values = [word.rstrip() for word in comma_split(line)]
    missing = len(fields) - len(values)
    if missing < 0:
        raise ValueError('Invalid row length')
    # NASTY HACK for .9xc format: right-pad with blanks to match "Blank"
    # fields in spec
    values.extend([''] * missing)

    parsed = {}
    for spec, raw in zip(fields, values):
        kind = spec['type']
        try:
            value = convert_field(kind, raw)
        except ValueError:
            if strict:
                raise
            warnings.warn(f'failed to convert "{raw}" to type "{kind}"')
            value = raw
        parsed[spec['description']] = value
    return parsed
241
242
def parse_header(line: str) -> tuple[IndicatorTrie, FieldParser, dict[str, Any]]:
    '''Parse a file header line and select how the records below it are read.

    The header itself is always read as fixed-width, in non-strict mode.

    Returns the indicator trie for the detected format (297 or 298), the field
    parser matching the header's delimiter, and the parsed header fields.

    Raises `ValueError` if the line is not a recognised header, or if its
    download format, version or delimiter is not an expected value.
    '''
    if 'US WELL DATA' in line:
        fmt, spec, trie = '297', SPEC297, SPEC297_TRIE
    elif 'US PRODUCTION DATA' in line:
        fmt, spec, trie = '298', SPEC298, SPEC298_TRIE
    else:
        raise ValueError(f'invalid header line: "{line}"')

    hdr = parse_fields_fixed(spec['header'], line)

    if hdr['Download Format'] != fmt:
        raise ValueError('unexpected download format')
    if hdr['Version'] != '1.1':
        raise ValueError('unexpected format version')

    delimiter = hdr['Delimiter']
    parsers = {'FIXED': parse_fields_fixed, 'COMMA': parse_fields_comma}
    field_parser = parsers.get(delimiter)
    if field_parser is None:
        raise ValueError('expected FIXED or COMMA format')

    return trie, field_parser, hdr
270
271
def parse_record_spec(line: str, trie: IndicatorTrie) -> RecordSpec:
    '''Look up the record spec whose indicator `line` starts with.

    Raises `ValueError` (chained to the underlying `KeyError`) if `trie` has
    no match for the line.
    '''
    try:
        return trie.longest_match(line)
    except KeyError as err:
        raise ValueError(f'Unknown record type for {line}') from err
277
278
def parse_record(line: str, spec: RecordSpec, field_parser: FieldParser,
  strict: bool = False) -> Record:
    '''Parse `line` into a `Record` using `spec` and the given field parser.'''
    indicator = spec['indicator']
    description = spec['description']
    contents = field_parser(spec['fields'], line, strict)
    return Record(indicator, description, contents)
283
284
def stream_records(src: TextIO, strict: bool = False,
  indicators: set[str] | None = None) -> Iterator[Record]:
    '''Stream records from an IHS 29x file.

    File type (297/well header or 298/well production) and format (fixed or
    comma-delimited) will be automatically determined.

    In accordance with the spec, files may contain multiple types and formats,
    each with their own header.
    Headers will be streamed as `Record` objects whose `indicator` is either
    `'US WELL DATA'` or `'US PRODUCTION DATA'` (for 297 or 298 formats
    respectively) and whose `type` is `'File Header'`.

    Parameters:
      - `src`: a file or file-like object open in text mode

      - `strict` (default `False`): if `True`, raise a `ValueError` when
          encountering a field which cannot be converted to the appropriate type
          as indicated by the spec; if `False`, these fields will be returned as
          `str` values, but a warning will be issued for each one encountered

      - `indicators` (default `None`): if provided, a set of indicator field
          values (e.g. `'A'` for "General Information" records from a 297 file)
          corresponding to rows which should be included in the output (rows
          with indicators not in this set will be skipped, except that per-well
          or per-entity start and end records [e.g. `'START_US_WELL'`] are
          always included); if `None`, all rows will be processed; the set
          passed in is not modified

    Returns an iterator over (selected) records from the file.

    Raises `ValueError` on an invalid header, an unknown record type, or
    start/end records which do not pair up.
    '''

    wanted = None
    if indicators is not None:
        # Work on a private copy so the caller's set is left untouched.
        wanted = set(indicators) | {
            'START_US_WELL', 'END_US_WELL', 'START_US_PROD', 'END_US_PROD'
        }

    seen_start = False
    count = 0
    while line := src.readline():
        trie, field_parser, hdr = parse_header(line)
        # Number of start/end-delimited entities announced by this header.
        count = int(hdr['Entity Count'])
        yield Record(hdr['Data Type'], 'File Header', hdr)

        while line := src.readline():
            rec_spec = parse_record_spec(line, trie)
            if wanted is not None and rec_spec['indicator'] not in wanted:
                continue
            rec = parse_record(line, rec_spec, field_parser, strict)
            if rec.type == 'Start Record Label':
                if seen_start:
                    raise ValueError('start without end')
                else:
                    seen_start = True
            if rec.type == 'End Record Label':
                if seen_start:
                    seen_start = False
                    count -= 1
                    if count == 0:
                        yield rec
                        break # try parsing as header
                else:
                    raise ValueError('end without start')
            yield rec

    if count > 0:
        warnings.warn(f'{count} records still expected at end of file')
351
# Names exported by `from ihs29x import *`.
__all__ = [
    'LatitudeDMS',
    'LongitudeDMS',
    'Record',
    'stream_records',
]
class LatitudeDMS(typing.NamedTuple):
58class LatitudeDMS(NamedTuple):
59    degrees: int
60    minutes: int
61    seconds: int

LatitudeDMS(degrees, minutes, seconds)

LatitudeDMS(degrees: int, minutes: int, seconds: int)

Create new instance of LatitudeDMS(degrees, minutes, seconds)

degrees: int

Alias for field number 0

minutes: int

Alias for field number 1

seconds: int

Alias for field number 2

Inherited Members
builtins.tuple
index
count
class LongitudeDMS(typing.NamedTuple):
68class LongitudeDMS(NamedTuple):
69    degrees: int
70    minutes: int
71    seconds: int

LongitudeDMS(degrees, minutes, seconds)

LongitudeDMS(degrees: int, minutes: int, seconds: int)

Create new instance of LongitudeDMS(degrees, minutes, seconds)

degrees: int

Alias for field number 0

minutes: int

Alias for field number 1

seconds: int

Alias for field number 2

Inherited Members
builtins.tuple
index
count
class Record(typing.NamedTuple):
75class Record(NamedTuple):
76    indicator: str
77    '''The record type indicator field's value.
78
79    In other words, the first few characters of the line, which indicate what
80    type of record the line represents.
81    '''
82
83    type: str
84    '''The record type, as described by the IHS spec.'''
85
86    contents: dict[
87      str,
88      str | float | int | datetime | LatitudeDMS | LongitudeDMS | None
89    ]
90    '''The record's fields, converted to appropriate types.'''

Record(indicator, type, contents)

Record( indicator: str, type: str, contents: dict[str, str | float | int | datetime.datetime | ihs29x.LatitudeDMS | ihs29x.LongitudeDMS | None])

Create new instance of Record(indicator, type, contents)

indicator: str

The record type indicator field's value.

In other words, the first few characters of the line, which indicate what type of record the line represents.

type: str

The record type, as described by the IHS spec.

contents: dict[str, str | float | int | datetime.datetime | ihs29x.LatitudeDMS | ihs29x.LongitudeDMS | None]

The record's fields, converted to appropriate types.

Inherited Members
builtins.tuple
index
count
def stream_records(src: TextIO, strict: bool = False, indicators: set[str] | None = None) -> Iterator[ihs29x.Record]:
286def stream_records(src: TextIO, strict: bool = False,
287  indicators: set[str] | None = None) -> Iterator[Record]:
288    '''Stream records from an IHS 29x file.
289
290    File type (297/well header or 298/well production) and format (fixed or
291    comma-delimited) will be automatically determined.
292
293    In accordance with the spec, files may contain multiple types and formats,
294    each with their own header.
295    Headers will be streamed as `Record` objects whose `indicator` is either
296    `'US WELL DATA'` or `'US PRODUCTION DATA'` (for 297 or 298 formats
297    respectively) and whose `type` is `'File Header'`.
298
299    Parameters:
300      - `src`: a file or file-like object open in text mode
301
302      - `strict` (default `False`): if `True`, raise a `ValueError` when
303          encountering a field which cannot be converted to the appropriate type
304          as indicated by the spec; if `False`, these fields will be returned as
305          `str` values, but a warning will be issued for each one encountered
306
307      - `indicators` (default `None`): if provided, a set of indicator field
308          values (e.g. `'A'` for "General Information" records from a 297 file)
309          corresponding to rows which should be included in the output (rows
310          with indicators not in this set will be skipped, except that per-well
311          or per-entity start and end records [e.g. `'START_US_WELL'`] are
312          always included); if `None`, all rows will be processed
313
314    Returns an iterator over (selected) records from the file.
315    '''
316
317    if indicators is not None:
318        indicators |= {
319            'START_US_WELL', 'END_US_WELL', 'START_US_PROD', 'END_US_PROD'
320        }
321
322    seen_start = False
323    count = 0
324    while line := src.readline():
325        trie, field_parser, hdr = parse_header(line)
326        count = int(hdr['Entity Count'])
327        yield Record(hdr['Data Type'], 'File Header', hdr)
328
329        while line := src.readline():
330            rec_spec = parse_record_spec(line, trie)
331            if indicators is not None and rec_spec['indicator'] not in indicators:
332                continue
333            rec = parse_record(line, rec_spec, field_parser, strict)
334            if rec.type == 'Start Record Label':
335                if seen_start:
336                    raise ValueError('start without end')
337                else:
338                    seen_start = True
339            if rec.type == 'End Record Label':
340                if seen_start:
341                    seen_start = False
342                    count -= 1
343                    if count == 0:
344                        yield rec
345                        break # try parsing as header
346                else:
347                    raise ValueError('end without start')
348            yield rec
349
350    if count > 0:
351        warnings.warn(f'{count} records still expected at end of file')

Stream records from an IHS 29x file.

File type (297/well header or 298/well production) and format (fixed or comma-delimited) will be automatically determined.

In accordance with the spec, files may contain multiple types and formats, each with their own header. Headers will be streamed as Record objects whose indicator is either 'US WELL DATA' or 'US PRODUCTION DATA' (for 297 or 298 formats respectively) and whose type is 'File Header'.

Parameters:

  • src: a file or file-like object open in text mode
  • strict (default False): if True, raise a ValueError when encountering a field which cannot be converted to the appropriate type as indicated by the spec; if False, these fields will be returned as str values, but a warning will be issued for each one encountered

  • indicators (default None): if provided, a set of indicator field values (e.g. 'A' for "General Information" records from a 297 file) corresponding to rows which should be included in the output (rows with indicators not in this set will be skipped, except that per-well or per-entity start and end records [e.g. 'START_US_WELL'] are always included); if None, all rows will be processed

Returns an iterator over (selected) records from the file.