66import sys
77import pathlib
88import itertools
9+ import types
910from collections import defaultdict
1011
1112import getopt
12- from typing import Callable , Generator
13+ from typing import Callable , Generator , OrderedDict
1314
14- from Bio import SeqIO , StreamModeError
15+ from Bio import SeqIO , StreamModeError , SeqFeature
1516import gffutils
1617from gffutils import biopython_integration
1718
@@ -117,6 +118,18 @@ def to_stats(record: SeqIO.SeqRecord) -> str:
117118 return str (gffutils .Feature (record .id , "biopython.convert" , "sequence" , start = 1 , end = len (record ), attributes = attributes ))
118119
119120
def _allow_single(records):
    """
    Helper to allow returning a single record from JMESPath
    :param records: SeqIO.SeqRecord instance, or any other value
    :return: one-element tuple containing the SeqIO.SeqRecord, or the argument unchanged when it is not a SeqIO.SeqRecord
    """
    # A JMESPath query may yield one bare record instead of a collection
    is_single = isinstance(records, SeqIO.SeqRecord)
    return (records,) if is_single else records
131+
132+
120133def _to_SeqRecord (records ):
121134 """
122135 Helper to convert all output records to SeqRecords
@@ -127,13 +140,9 @@ def _to_SeqRecord(records):
127140 # Support generating a single new record in JMESPath
128141 records = SeqIO .SeqRecord (** records )
129142
130- if isinstance (records , SeqIO .SeqRecord ):
131- # Support returning single record from JMESPath
132- records = (records ,)
133-
134- records = map (lambda r : SeqIO .SeqRecord (** r ) if isinstance (records , dict ) else r , records )
143+ records = _allow_single (records )
135144
136- return records
145+ return map ( lambda r : SeqIO . SeqRecord ( ** r ) if isinstance ( records , dict ) else r , records )
137146
138147
139148def get_records (input_handle , input_type : str , jpath : str = '' , xform : Callable = _to_SeqRecord ):
@@ -170,7 +179,7 @@ def gentype(x):
170179 if jpath :
171180 input_records = JMESPathGen .search (jpath , gentype (input_records ))
172181
173- # Apply xform to both entire return value and each returned element
182+ # Apply xform to both entire return value
174183 input_records = xform (input_records )
175184
176185 return input_records
@@ -218,21 +227,107 @@ def _print_stats(record, stats):
218227 return record
219228
220229
221- def convert (input_path , input_type , output_path , output_type , split = None , jpath = '' , stats = None ):
def to_strings(v):
    """
    Helper to recursively convert Generators to lists, stringifying all else
    Generators, map/filter objects and tuples become lists. Lists and dicts
    that are passed in are updated in place and returned. Every other value,
    including indexable values that do not support item assignment
    (bytes, range, read-only sequences), is returned as str(v).
    :param v: Parent object/list
    :return: list/dict with all children converted to the same or a string
    """
    if isinstance(v, str):
        return v

    if isinstance(v, (types.GeneratorType, map, filter, tuple)):
        v = list(v)

    # Children are written back with v[i] = ..., so only containers that accept
    # item assignment can be recursed into. bytes-like values are leaves too:
    # a bytearray accepts assignment but cannot store the converted strings.
    if isinstance(v, (bytes, bytearray)) or not hasattr(v, '__setitem__'):
        return str(v)

    if hasattr(v, 'keys'):
        # Snapshot the keys so the mapping is not iterated while being rewritten
        keys = list(v.keys())
    elif hasattr(v, '__getitem__'):
        keys = range(len(v))
    else:
        return str(v)

    for i in keys:
        v[i] = to_strings(v[i])
    return v
252+
253+
def to_dicts(v):
    """
    Helper to recursively convert Objects and Generators to dicts and lists
    Strings that parse as an integer are returned as int. Generators,
    map/filter objects and tuples become lists. Lists and dicts that are
    passed in are updated in place and returned. Objects are converted through
    a copy of their attribute dict, so their attributes are not rebound.
    Values that are neither objects nor assignable containers are returned
    unchanged.
    :param v: Parent object/list
    :return: list/dict with all children converted to the same
    """
    if isinstance(v, str):
        try:
            return int(v)
        except ValueError:
            return v

    if isinstance(v, (types.GeneratorType, map, filter, tuple)):
        v = list(v)

    if isinstance(v, SeqIO.SeqRecord):
        # Expose the sequence as a plain string under 'seq' instead of '_seq'
        v = {
            **v.__dict__,
            'seq': str(v.seq),
        }
        del v['_seq']
    elif isinstance(v, SeqFeature.FeatureLocation):
        # Replace the private '_start'/'_end'/'_strand' entries with the public values
        v = {
            **v.__dict__,
            'start': v.start,
            'end': v.end,
            'strand': v.strand,
        }
        del v['_start']
        del v['_end']
        del v['_strand']
    elif isinstance(v, SeqFeature.AbstractPosition):
        # Positions go through their string form, which becomes an int when numeric
        return to_dicts(str(v))
    elif isinstance(v, OrderedDict):
        # NOTE(review): presumably special-cased so the mapping content is kept
        # rather than being replaced by the generic __dict__ branch below - confirm
        v = dict(v)
    elif hasattr(v, '__dict__'):
        # Work on a copy: recursing into the live __dict__ would rebind the
        # attributes of the source object to their converted values
        v = dict(v.__dict__)

    # Children are written back with v[i] = ..., so leaves and read-only
    # indexables (bytes, range, ...) are returned unchanged instead of raising
    if not hasattr(v, '__setitem__'):
        return v

    if hasattr(v, 'keys'):
        # Snapshot the keys so the mapping is not iterated while being rewritten
        keys = list(v.keys())
    elif hasattr(v, '__getitem__'):
        keys = range(len(v))
    else:
        return v

    for i in keys:
        v[i] = to_dicts(v[i])
    return v
303+
304+
305+ def convert (input_path : pathlib .Path , input_type : str , output_path : pathlib .Path , output_type : str , split : bool = False , jpath : str = '' , stats = None ):
306+ """
307+ Convert document from one format to another, optionally querying via JMESPath or splitting into separate outputs
308+ :param input_path: Path to input dataset
309+ :param input_type: Format of input dataset
310+ :param output_path: Path to output dataset
311+ :param output_type: Format of output dataset
312+ :param split: Split each record into a different output dataset. Adds index suffix to output path.
313+ :param jpath: JMESPath query to apply to input dataset before outputting
314+ :param stats: File handle to output GFF3 summary of output records
315+ :return: None
316+ """
222317 xform = _to_SeqRecord
223318 with input_path .open ("r" ) as handle :
224319 if output_type == 'text' :
225- writer = lambda r , fh , t : fh .write ("\n " .join (map (str , r )) + "\n " )
320+ writer = lambda records , fh , t : fh .write ("\n " .join (map (str , to_strings ( records ) )) + "\n " )
226321 xform = lambda x : x
227322 elif output_type == 'json' :
228323 import json
229- writer = lambda r , fh , t : json .dump (tuple ( r ), fh , skipkeys = True )
230- xform = lambda x : x
324+ writer = lambda records , fh , t : json .dump (to_dicts ( records ), fh , skipkeys = True , indent = True )
325+ xform = _allow_single
231326 elif output_type in ('yml' , 'yaml' ):
232327 from ruamel .yaml import YAML
233328 yml = YAML (typ = 'unsafe' )
234- writer = lambda r , fh , t : yml .dump (tuple ( r ), fh )
235- xform = lambda x : x
329+ writer = lambda records , fh , t : yml .dump (to_dicts ( records ), fh )
330+ xform = _allow_single
236331 elif output_type in gff_types :
237332 writer = gff_writer
238333 else :
0 commit comments