Skip to content

Commit 24e4f37

Browse files
author
Nolan Woods
committed
Clean up json/txt/yaml output and add more tests
1 parent da670ce commit 24e4f37

6 files changed

Lines changed: 449769 additions & 115 deletions

File tree

biopython_convert/__init__.py

Lines changed: 110 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,13 @@
66
import sys
77
import pathlib
88
import itertools
9+
import types
910
from collections import defaultdict
1011

1112
import getopt
12-
from typing import Callable, Generator
13+
from typing import Callable, Generator, OrderedDict
1314

14-
from Bio import SeqIO, StreamModeError
15+
from Bio import SeqIO, StreamModeError, SeqFeature
1516
import gffutils
1617
from gffutils import biopython_integration
1718

@@ -117,6 +118,18 @@ def to_stats(record: SeqIO.SeqRecord) -> str:
117118
return str(gffutils.Feature(record.id, "biopython.convert", "sequence", start=1, end=len(record), attributes=attributes))
118119

119120

121+
def _allow_single(records):
    """
    Helper to allow returning a single record from JMESPath

    A lone SeqIO.SeqRecord is wrapped in a one-element tuple so callers can always iterate
    over the result; any other value is handed back untouched.
    :param records: SeqIO.SeqRecord instance, or any other query result
    :return: tuple containing the SeqIO.SeqRecord, or ``records`` unchanged
    """
    return (records,) if isinstance(records, SeqIO.SeqRecord) else records
131+
132+
120133
def _to_SeqRecord(records):
121134
"""
122135
Helper to convert all output records to SeqRecords
@@ -127,13 +140,9 @@ def _to_SeqRecord(records):
127140
# Support generating a single new record in JMESPath
128141
records = SeqIO.SeqRecord(**records)
129142

130-
if isinstance(records, SeqIO.SeqRecord):
131-
# Support returning single record from JMESPath
132-
records = (records,)
133-
134-
records = map(lambda r: SeqIO.SeqRecord(**r) if isinstance(records, dict) else r, records)
143+
records = _allow_single(records)
135144

136-
return records
145+
return map(lambda r: SeqIO.SeqRecord(**r) if isinstance(records, dict) else r, records)
137146

138147

139148
def get_records(input_handle, input_type: str, jpath: str = '', xform: Callable = _to_SeqRecord):
@@ -170,7 +179,7 @@ def gentype(x):
170179
if jpath:
171180
input_records = JMESPathGen.search(jpath, gentype(input_records))
172181

173-
# Apply xform to both entire return value and each returned element
182+
# Apply xform to the entire return value
174183
input_records = xform(input_records)
175184

176185
return input_records
@@ -218,21 +227,107 @@ def _print_stats(record, stats):
218227
return record
219228

220229

221-
def convert(input_path, input_type, output_path, output_type, split=None, jpath='', stats=None):
230+
def to_strings(v):
    """
    Helper to recursively convert Generators to lists, stringifying all else

    Generators, map/filter iterators and tuples are materialised as a new list. Containers that
    support item assignment (dicts, lists, ...) are updated in place and returned. Everything
    else, including read-only sequences such as bytes or range, is converted with str().
    :param v: Parent object/list
    :return: list/dict with all children converted to the same or a string
    """
    if isinstance(v, str):
        return v

    if isinstance(v, (types.GeneratorType, map, filter, tuple)):
        v = list(v)

    # Children are written back with v[i] = ..., so only recurse into containers that accept
    # item assignment. Byte buffers accept it but only take ints, so they are stringified whole.
    assignable = hasattr(v, '__setitem__') and not isinstance(v, (bytearray, memoryview))
    if assignable and hasattr(v, 'keys'):
        # Snapshot the keys so the mapping is never iterated while being written to
        keys = list(v.keys())
    elif assignable and hasattr(v, '__getitem__') and hasattr(v, '__len__'):
        keys = range(len(v))
    else:
        return str(v)

    for i in keys:
        v[i] = to_strings(v[i])
    return v
252+
253+
254+
def to_dicts(v):
    """
    Helper to recursively convert Objects and Generators to dicts and lists

    Strings that parse as integers are returned as int. Generators, map/filter iterators and
    tuples are materialised as a new list. Objects are replaced by a copy of their attribute
    dict, so the object being serialised is never modified. Plain lists and dicts passed in
    are updated in place and returned.
    :param v: Parent object/list
    :return: list/dict with all children converted to the same
    """
    if isinstance(v, str):
        try:
            return int(v)
        except ValueError:
            return v

    # Plain scalars need no conversion. The exact type is compared on purpose: subclasses
    # (e.g. position classes deriving from int) must still reach the checks below.
    if v is None or type(v) in (bool, int, float, bytes):
        return v

    if isinstance(v, (types.GeneratorType, map, filter, tuple)):
        v = list(v)

    # Built-in list/dict instances can never match the object conversions, so skip them
    if type(v) not in (list, dict):
        if isinstance(v, SeqIO.SeqRecord):
            # Expose the sequence as a plain string in place of the private '_seq' attribute
            v = {
                **v.__dict__,
                'seq': str(v.seq)
            }
            del v['_seq']
        elif isinstance(v, SeqFeature.FeatureLocation):
            # Swap the private fields for the values of the public properties
            v = {
                **v.__dict__,
                'start': v.start,
                'end': v.end,
                'strand': v.strand,
            }
            del v['_start']
            del v['_end']
            del v['_strand']
        elif isinstance(v, SeqFeature.AbstractPosition):
            # Positions are emitted via their string form (converted to int when it parses)
            return to_dicts(str(v))
        elif isinstance(v, OrderedDict):
            v = dict(v)
        elif hasattr(v, '__dict__'):
            # Copy: the loop below assigns into v and must not rewrite the object's attributes
            v = dict(v.__dict__)

    if hasattr(v, 'keys'):
        keys = list(v.keys())
    elif hasattr(v, '__getitem__') and hasattr(v, '__setitem__'):
        keys = range(len(v))
    else:
        # Leaf value, or a read-only sequence that cannot take converted children
        return v

    for i in keys:
        v[i] = to_dicts(v[i])
    return v
303+
304+
305+
def convert(input_path: pathlib.Path, input_type: str, output_path: pathlib.Path, output_type: str, split: bool = False, jpath: str = '', stats=None):
306+
"""
307+
Convert document from one format to another, optionally querying via JMESPath or splitting into separate outputs
308+
:param input_path: Path to input dataset
309+
:param input_type: Format of input dataset
310+
:param output_path: Path to output dataset
311+
:param output_type: Format of output dataset
312+
:param split: Split each record into a different output dataset. Adds index suffix to output path.
313+
:param jpath: JMESPath query to apply to input dataset before outputting
314+
:param stats: File handle to output GFF3 summary of output records
315+
:return: None
316+
"""
222317
xform = _to_SeqRecord
223318
with input_path.open("r") as handle:
224319
if output_type == 'text':
225-
writer = lambda r, fh, t: fh.write("\n".join(map(str, r)) + "\n")
320+
writer = lambda records, fh, t: fh.write("\n".join(map(str, to_strings(records))) + "\n")
226321
xform = lambda x: x
227322
elif output_type == 'json':
228323
import json
229-
writer = lambda r, fh, t: json.dump(tuple(r), fh, skipkeys=True)
230-
xform = lambda x: x
324+
writer = lambda records, fh, t: json.dump(to_dicts(records), fh, skipkeys=True, indent=True)
325+
xform = _allow_single
231326
elif output_type in ('yml', 'yaml'):
232327
from ruamel.yaml import YAML
233328
yml = YAML(typ='unsafe')
234-
writer = lambda r, fh, t: yml.dump(tuple(r), fh)
235-
xform = lambda x: x
329+
writer = lambda records, fh, t: yml.dump(to_dicts(records), fh)
330+
xform = _allow_single
236331
elif output_type in gff_types:
237332
writer = gff_writer
238333
else:

0 commit comments

Comments
 (0)