Merge pull request #33 from MuhammedHasan/master

Muhammed Hasan · web-flow · commit d30eb1bb06e5 · 2019-05-31T00:13:55.000+02:00
Split variant based on interval start-end if not fixed_len, fixes #32
diff --git a/kipoiseq/extractors/vcf_seq.py b/kipoiseq/extractors/vcf_seq.py
@@ -88,13 +88,9 @@ def extract(self, interval, variants, anchor, fixed_len=True):
             which to query the sequence. 0-based
           variants List[cyvcf2.Variant]: variants overlapping the `interval`.
             can also be indels. 1-based
-          anchor: position w.r.t. the interval start. (0-based). E.g.
-            for an interval of `chr1:10-20` the anchor of 0 denotes
-            the point chr1:10 in the 0-based coordinate system. Similarly,
-            `anchor=5` means the anchor point is right in the middle
-            of the sequence e.g. first half of the sequence (5nt) will be
-            upstream of the anchor and the second half (5nt) will be
-            downstream of the anchor.
+          anchor: absolute position on the chromosome (0-based), i.e. not
+            an offset from the interval start. E.g. for an interval of
+            `chr1:10-20` the anchor of 10 denotes the point chr1:10.
           fixed_len: if True, the return sequence will have the same length
             as the `interval` (e.g. `interval.end - interval.start`)
 
@@ -106,7 +102,16 @@ def extract(self, interval, variants, anchor, fixed_len=True):
         variant_pairs = self._variant_to_sequence(variants)
 
         # 1. Split variants overlapping with anchor
-        variant_pairs = list(self._split_overlapping(variant_pairs, anchor))
+        # and with the interval start and end if not fixed_len
+        variant_pairs = self._split_overlapping(variant_pairs, anchor)
+
+        if not fixed_len:
+            variant_pairs = self._split_overlapping(
+                variant_pairs, interval.start, which='right')
+            variant_pairs = self._split_overlapping(
+                variant_pairs, interval.end, which='left')
+
+        variant_pairs = list(variant_pairs)
 
         # 2. split the variants into upstream and downstream
         # and sort the variants in each interval
@@ -168,15 +173,17 @@ def _variant_to_sequence(self, variants):
                            start=v.start, end=v.start + len(v.ALT[0]))
             yield ref, alt
 
-    def _split_overlapping(self, variant_pairs, anchor):
+    def _split_overlapping(self, variant_pairs, anchor, which='both'):
         """
         Split the variants hitting the anchor into two
         """
         for ref, alt in variant_pairs:
             if ref.start < anchor < ref.end or alt.start < anchor < alt.end:
                 mid = anchor - ref.start
-                yield ref[:mid], alt[:mid]
-                yield ref[mid:], alt[mid:]
+                if which == 'left' or which == 'both':
+                    yield ref[:mid], alt[:mid]
+                if which == 'right' or which == 'both':
+                    yield ref[mid:], alt[mid:]
             else:
                 yield ref, alt
 
@@ -201,7 +208,7 @@ def _downstream_builder(self, down_variants, interval, anchor, istart):
 
         prev = anchor
         for ref, alt in down_variants:
-            if ref.end <= istart:
+            if ref.end < istart:
                 break
             down_sb.append(Interval(interval.chrom, ref.end, prev))
             down_sb.append(alt)
@@ -239,14 +246,28 @@ def _cut_to_fix_len(self,  down_str, up_str, interval, anchor):
 
 
 class BaseVCFSeqExtractor(BaseExtractor):
+    """
+    Base class for extractors that fetch sequences with the variants
+    from a given vcf file applied.
+    """
+
     def __init__(self, fasta_file, vcf_file):
+        """
+        Args:
+          fasta_file: path to the fasta file (can be gzipped)
+          vcf_file: path to the vcf file (needs to be bgzipped and indexed)
+        """
         self.fasta_file = fasta_file
         self.vcf_file = vcf_file
         self.variant_extractor = VariantSeqExtractor(fasta_file)
         self.vcf = MultiSampleVCF(vcf_file)
 
 
 class SingleVariantVCFSeqExtractor(BaseVCFSeqExtractor):
+    """
+    Fetch a list of sequences, each with one variant applied, based
+    on the given vcf file.
+    """
 
     def extract(self, interval, anchor=None, sample_id=None, fixed_len=True):
         for variant in self.vcf.fetch_variants(interval, sample_id):
@@ -257,6 +278,9 @@ def extract(self, interval, anchor=None, sample_id=None, fixed_len=True):
 
 
 class SingleSeqVCFSeqExtractor(BaseVCFSeqExtractor):
+    """
+    Fetch the sequence with all variants from the given vcf file applied.
+    """
 
     def extract(self, interval, anchor=None, sample_id=None, fixed_len=True):
         return self.variant_extractor.extract(
diff --git a/tests/extractors/test_vcf_seq_extractor.py b/tests/extractors/test_vcf_seq_extractor.py
@@ -92,15 +92,14 @@ def test__split_overlapping(variant_seq_extractor):
 
 
 def test_extract(variant_seq_extractor):
-    interval = Interval('chr1', 2, 9)
-
     variants = list(VCF(vcf_file)())
+
+    interval = Interval('chr1', 2, 9)
     seq = variant_seq_extractor.extract(interval, variants, anchor=5)
     assert len(seq) == interval.end - interval.start
     assert seq == 'GCGAACG'
 
     interval = Interval('chr1', 2, 9, strand='-')
-    variants = list(VCF(vcf_file)())
     seq = variant_seq_extractor.extract(interval, variants, anchor=5)
     assert len(seq) == interval.end - interval.start
     assert seq == 'CGTTCGC'
@@ -140,6 +139,11 @@ def test_extract(variant_seq_extractor):
     assert len(seq) == interval.end - interval.start
     assert seq == 'AACGTAACGT'
 
+    interval = Interval('chr1', 5, 11, strand='+')
+    seq = variant_seq_extractor.extract(
+        interval, variants, anchor=10, fixed_len=False)
+    assert seq == 'AACGTAA'
+
 
 @pytest.fixture
 def single_variant_vcf_seq_extractor():