BioPython-Convert/biopython_convert/__init__.py at 4027a328fa3e9fc020e6a1e9fa0d94af218c9d24 · brinkmanlab/BioPython-Convert · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
"""
Biopython Convert
Convert between any formats that Biopython supports or gffutils.
Provides a means of querying/filtering documents using JMESPath query language.
"""
import sys
import pathlib
import itertools
import types
from collections import defaultdict

import getopt
from typing import Callable, Generator, OrderedDict

from Bio import SeqIO, StreamModeError, SeqFeature, Seq
import gffutils
from gffutils import biopython_integration

from . import JMESPathGen

gff_types = ['gff', 'gff3']
extended_types = ['text', 'json', 'yaml', 'yml']
SeqIO_types = ['abi', 'abi-trim', 'ace', 'cif-atom', 'cif-seqres', 'clustal', 'embl', 'fasta', 'fasta-2line',
               'fastq-sanger', 'fastq', 'fastq-solexa', 'fastq-illumina', 'genbank', 'gb', 'ig', 'imgt', 'nexus',
               'pdb-seqres', 'pdb-atom', 'phd', 'phylip', 'pir', 'seqxml', 'sff', 'sff-trim', 'stockholm', 'swiss',
               'tab', 'qual', 'uniprot-xml']
stat_annotations = ['molecule_type', 'topology', 'data_file_division', 'date', 'accessions', 'sequence_version', 'gi',
                    'keywords', 'source', 'organism']

JMESPathGenOptions = JMESPathGen.Options(custom_functions=JMESPathGen.ExtendedFunctions(), custom_slice_types=(SeqIO.SeqRecord,))

usage = """\
Use: biopython.convert [-s] [-v] [-i] [-q JMESPath] input_file input_type output_file output_type
\t-s Split records into seperate files
\t-q JMESPath to select records. Must return list of SeqIO records. Root is list of input SeqIO records.
\t-i Print out details of records during conversion
\t-v Print version and exit
""" + "\nValid types: " + ', '.join(SeqIO_types + gff_types + extended_types) + "\n"


def get_args(sysargs: list):
    """
    Parse command line arguments
    :param sysargs: list of command line arguments (sys.argv[1:])
    :return: (input_path, input_type, output_path, output_type, split, jmespath, stats)
    """
    split = False
    jpath = None
    stats = None
    # Parse arguments
    try:
        opts, args = getopt.gnu_getopt(sysargs, 'vsiq:')
        for opt, val in opts:
            if opt == '-v':
                from . import __version
                print(__version.__version__)
                exit(0)
            elif opt == '-s':
                split = True
            elif opt == '-q':
                if not val:
                    raise getopt.GetoptError("JMESPath must not be empty", "-q")
                jpath = val
            elif opt == '-i':
                stats = sys.stdout

    except getopt.GetoptError as err:
        print("Argument error(" + str(err.opt) + "): " + err.msg, file=sys.stderr)
        args = []

    # Check for minimum number of arguments
    if len(args) < 4:
        print(usage, file=sys.stderr)
        exit(1)

    input_path = pathlib.Path(args[0])
    input_type = args[1]
    output_path = pathlib.Path(args[2])
    output_type = args[3]

    return input_path, input_type, output_path, output_type, split, jpath, stats


def to_stats(record: SeqIO.SeqRecord) -> str:
    """
    Build GFF record representing summary of SeqRecord
    :param record: SeqIO.SeqRecord to represent
    :return: string containing GFF record
    """
    # 'if' statements can be removed after https://github.com/daler/gffutils/pull/144
    if record.name:
        attributes = {'Name': [record.name]}
    else:
        attributes = {}
    for k, v in record.annotations.items():
        if k in stat_annotations:
            if isinstance(v, list):
                v = [str(a) for a in v]
            else:
                v = [str(v)]
            if v:
                attributes[k] = v

    feat_count = defaultdict(int)
    for f in record.features:
        if f.type == 'source':
            # Include source coordinate
            attributes[f"source__location"] = [str(part) for part in f.location.parts]
            # Include source qualifiers
            for k, v in f.qualifiers.items():
                attr = attributes.get(f"source_{k}", [])
                if not isinstance(attr, list):
                    attr = [attr]
                attr.append(v)
                attributes[f"source_{k}"] = attr
        # Count features of each type
        feat_count[f.type] += 1
    attributes['features'] = [f"{k}:{v}" for k, v in feat_count.items()]

    if record.description:
        attributes['desc'] = [record.description]
    return str(gffutils.Feature(record.id, "biopython.convert", "sequence", start=1, end=len(record), attributes=attributes))


def _allow_single(records):
    """
    Helper to allow returing single record from JMESPath
    :param records: SeqIO.SeqRecord instance
    :return: tuple containing SeqIO.SeqRecord
    """
    if isinstance(records, SeqIO.SeqRecord):
        # Support returning single record from JMESPath
        return (records,)
    return records


def _to_SeqRecord(record):
    """
    Support generating a single new record in JMESPath
    :param record: dict of SeqRecord constructor parameters
    :return: SeqRecord
    """
    seq = record.get('seq', {})
    del record['seq']
    if isinstance(seq, str):
        seq = Seq.Seq(seq)
    elif isinstance(seq, dict):
        seq = Seq.Seq(**seq)
    return SeqIO.SeqRecord(seq=seq, **record)


def _to_SeqRecords(records):
    """
    Helper to convert all output records to SeqRecords
    :param records: dict or SeqRecord
    :return: SeqRecord
    """
    if isinstance(records, dict):
        records = _to_SeqRecord(records)

    records = _allow_single(records)

    return map(lambda r: _to_SeqRecord(r) if isinstance(r, dict) else r, records)

def get_records(input_handle, input_type: str, jpath: str = '', xform: Callable = _to_SeqRecords):
    """
    Read in records and apply optional jmespath
    :param input_handle: File handle to read data from
    :param input_type: one of abi,abi-trim,ace,cif-atom,cif-seqres,clustal,embl,fasta,fasta-2line,fastq-sanger,fastq,
        fastq-solexa,fastq-illumina,genbank,gb,ig,imgt,nexus,pdb-seqres,pdb-atom,phd,phylip,pir,seqxml,sff,sff-trim,
        stockholm,swiss,tab,qual,uniprot-xml,gff3
    :param jpath: JMESPath selecting records to keep. The root is the list of records. The path must return a list of records.
    :param xform: Callable that takes the result of the jmespath and does anything necessary to convert to a iterable of output records
    :return: iterable of resulting records
    """
    def gentype(x):
        # Newer versions of biopython return iterator that is not of type Generator, wrap in generator type so jmespath can detect
        for a in x:
            yield a

    if input_type in gff_types:
        # If input is GFF load with gffutils library
        db = gffutils.create_db(input_handle, ":memory:", merge_strategy="create_unique")
        # Wrap features in generator that converts to BioPython SeqRecords
        input_records = map(
            lambda x: SeqIO.SeqRecord("", features=list(x)),
            itertools.groupby(
                map(biopython_integration.to_seqfeature, db.all_features(order_by="seqid")),
                lambda x: x.id
            )
        )
    else:
        input_records = SeqIO.parse(input_handle, input_type)

    # Wrap input in JMESPath selector if provided
    if jpath:
        input_records = JMESPathGen.search(jpath, gentype(input_records), JMESPathGenOptions)

    # Apply xform to both entire return value
    input_records = xform(input_records)

    return input_records


def _generate_suffixes(path: pathlib.Path) -> Generator[pathlib.Path, None, None]:
    """
    Helper to generate a new file path from a base path on each iteration
    :param path: base path to add suffix to
    :return: new path
    """
    i = 0
    while True:
        # Open output file with file name suffix if splitting
        yield path.with_suffix(f".{i}{path.suffix}")
        i += 1


def gff_writer(records: [SeqIO.SeqRecord], handle, output_type: str):
    """
    Convert SeqRecord to gffutils GFF3 record and output to handle
    :param handle: file handle to write to
    :param records: iterable of SeqRecord instances
    :param output_type: output format, ignored
    :return: None
    """
    for record in records:
        # TODO extend gffutils SeqFeature support
        for feature in record.features:
            feature = biopython_integration.from_seqfeature(feature)
            feature.seqid = record.id
            feature.source = 'biopython.convert'
            print(feature, file=handle)


def _print_stats(record, stats):
    """
    Helper to print stats of record
    :param record: SeqRecord to print stats of
    :param stats: IO handle or None
    :return: record, unaltered
    """
    if stats and isinstance(record, SeqIO.SeqRecord):
        print(to_stats(record), file=stats)
    return record


def to_strings(v):
    """
    Helper to recursively convert Generators to lists, stringifing all else
    :param v: Parent object/list
    :return: list/dict with all children converted to the same or a string
    """
    if isinstance(v, str):
        return v

    if isinstance(v, (types.GeneratorType, map, filter, tuple)):
        v = list(v)

    if hasattr(v, 'keys'):
        keys = v.keys()
    elif hasattr(v, '__getitem__'):
        keys = range(len(v))
    else:
        return str(v)

    for i in keys:
        v[i] = to_strings(v[i])
    return v


def to_dicts(v):
    """
    Helper to recursively convert Objects and Generators to dicts and lists
    :param v: Parent object/list
    :return: list/dict with all children converted to the same
    """
    try:
        position_class = SeqFeature.Position
    except AttributeError:
        position_class = SeqFeature.AbstractPosition
    if isinstance(v, str):
        try:
            return int(v)
        except ValueError:
            pass
        return v

    if isinstance(v, Seq.Seq):
        return str(v)

    if isinstance(v, (types.GeneratorType, map, filter, tuple)):
        v = list(v)

    if isinstance(v, SeqIO.SeqRecord):
        v = {
            **v.__dict__,
            'seq': str(v.seq)
        }
        del v['_seq']
    elif isinstance(v, SeqFeature.FeatureLocation):
        v = {
            **v.__dict__,
            'start': v.start,
            'end': v.end,
            'strand': v.strand,
        }
        del v['_start']
        del v['_end']
        del v['_strand']
    elif isinstance(v, position_class):
        return to_dicts(str(v))
    elif isinstance(v, (OrderedDict, defaultdict)):
        v = dict(v)
    elif hasattr(v, '__dict__'):
        v = v.__dict__

    if hasattr(v, 'keys'):
        keys = v.keys()
    elif hasattr(v, '__getitem__'):
        keys = range(len(v))
    else:
        return v

    for i in keys:
        v[i] = to_dicts(v[i])
    return v


def convert(input_path: pathlib.Path, input_type: str, output_path: pathlib.Path, output_type: str, split: bool = False, jpath: str = '', stats=None):
    """
    Convert document from one format to another, optionally querying via JMESPath or splitting into separate outputs
    :param input_path: Path to input dataset
    :param input_type: Format of input dataset
    :param output_path: Path to output dataset
    :param output_type: Format of output dataset
    :param split: Split each record into a different output dataset. Adds index suffix to output path.
    :param jpath: JMESPath query to apply to input dataset before outputting
    :param stats: File handle to output GFF3 summary of output records
    :return: None
    """
    xform = _to_SeqRecords
    with input_path.open("r") as handle:
        if output_type == 'text':
            writer = lambda records, fh, t: fh.write("\n".join(map(str, to_strings(records))) + "\n")
            xform = lambda x: x
        elif output_type == 'json':
            import json
            writer = lambda records, fh, t: json.dump(to_dicts(records), fh, skipkeys=True, indent=True)
            xform = _allow_single
        elif output_type in ('yml', 'yaml'):
            from ruamel.yaml import YAML
            yml = YAML(typ='unsafe')
            writer = lambda records, fh, t: yml.dump(to_dicts(records), fh)
            xform = _allow_single
        elif output_type in gff_types:
            writer = gff_writer
        else:
            writer = SeqIO.write

        if stats:
            print("##gff-version 3", file=stats)

        seq_records = get_records(handle, input_type, jpath, xform)
        binary = ''
        if split:
            for record, path in zip(seq_records, _generate_suffixes(output_path)):
                _print_stats(record, stats)
                while True:
                    try:
                        with path.open('w' + binary) as output_handle:
                            writer((record,), output_handle, output_type)
                    except StreamModeError:
                        if binary == 'b':
                            raise
                        binary = 'b'
                        continue
                    except Seq.UndefinedSequenceError:
                        path.unlink(True)
                    break
        else:
            while True:
                try:
                    with output_path.open('w' + binary) as output_handle:
                        writer(
                            map(
                                lambda r: _print_stats(r, stats),
                                seq_records
                            ),
                            output_handle,
                            output_type
                        )
                except StreamModeError:
                    if binary == 'b':
                        raise
                    binary = 'b'
                    continue
                except Seq.UndefinedSequenceError:
                    pass
                break