From acb580ffd4481691444740f8ecfa575ae2c722f6 Mon Sep 17 00:00:00 2001
From: Dmytro Trybushnyi <dima_tr@yahoo.com>
Date: Tue, 28 Apr 2020 23:58:21 +0200
Subject: [PATCH 01/25] do the data repack into the ontology containers with
 further RDF 'turtle' format dump

---
 data/chrk_ath_12samples_10kb.w100000_S.json |   2 +-
 matrixcomponent/JSONparser.py               |   2 +-
 matrixcomponent/PangenomeSchematic.py       |  90 +++++++-
 matrixcomponent/matrix.py                   |   1 +
 matrixcomponent/ontology.py                 | 230 ++++++++++++++++++++
 segmentation.py                             |  20 +-
 6 files changed, 338 insertions(+), 7 deletions(-)
 create mode 100644 matrixcomponent/ontology.py

diff --git a/data/chrk_ath_12samples_10kb.w100000_S.json b/data/chrk_ath_12samples_10kb.w100000_S.json
index 3f89fd3..94b34f5 100644
--- a/data/chrk_ath_12samples_10kb.w100000_S.json
+++ b/data/chrk_ath_12samples_10kb.w100000_S.json
@@ -1,4 +1,4 @@
-{"odgi_version": 10,"bin_width": 100000,"pangenome_length": 210026186}
+{"odgi_version": 12,"bin_width": 100000,"pangenome_length": 210026186}
 {"bin_id":1}
 {"bin_id":2}
 {"bin_id":3}
diff --git a/matrixcomponent/JSONparser.py b/matrixcomponent/JSONparser.py
index f04fc61..a246da8 100644
--- a/matrixcomponent/JSONparser.py
+++ b/matrixcomponent/JSONparser.py
@@ -37,7 +37,7 @@ def process_path(line=None):
             if type(ranges) is not list and len(b) >= 6:
                 ranges = [[b[4], b[5]]]
 
-            bin = matrix.Bin(b[0], b[1], b[2], ranges)
+            bin = matrix.Bin(b[0], b[1], b[2], ranges, 0) # path_id = 0
             p.bins.setdefault(bin.bin_id, bin)
 
         p.links = np.asarray(path['links'], dtype='int32')
diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index 1f3d11b..0b18e09 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -10,9 +10,11 @@
 
 from dataclasses import dataclass
 
-from matrixcomponent import JSON_VERSION
+from matrixcomponent import JSON_VERSION, ontology
 from matrixcomponent.matrix import Component, Bin, LinkColumn
 
+from rdflib import Graph, Namespace, URIRef
+
 from DNASkittleUtils.Contigs import Contig, write_contigs_to_file
 
 @dataclass
@@ -50,7 +52,7 @@ def update_first_last_bin(self):
         self.first_bin = 1  # these have not been properly initialized
         self.last_bin = self.components[-1].last_bin
 
-    def split_and_write(self, cells_per_file, folder, fasta : Contig):
+    def split_and_write(self, cells_per_file, folder, fasta: Contig, ontology_folder):
         """Splits one Schematic into multiple files with their own
         unique first and last_bin based on the volume of data desired per
         file specified by cells_per_file.  """
@@ -96,6 +98,87 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig):
                                          "first_bin": schematic.first_bin,
                                          "last_bin": schematic.last_bin})
 
+                if ontology_folder:
+                    # going from top to bottom
+                    zoom_level = ontology.ZoomLevel()
+                    zoom_level.zoom_factor = schematic.bin_width
+                    zoom_level.ns = URIRef('pg/')
+
+                    cell_counter = 0
+                    obin_dict = {}
+                    for ic, component in enumerate(schematic.components):
+                        ocomp = ontology.Component(ic + 1)
+                        zoom_level.components.append(ocomp)
+
+                        # bins
+                        for bins in component.matrix:
+                            for bin in bins:
+                                if bin:
+                                    obin = ontology.Bin()
+                                    obin.bin_rank = bin.bin_id
+                                    obin.path_id  = bin.path_id # saved in the populate_component_matrix
+                                    obin_dict[bin.bin_id] = obin
+                                    ocomp.bins.append(obin)
+
+                                    cell_counter = cell_counter + 1
+                                    ocell = ontology.Cell()
+                                    ocell.id = cell_counter
+                                    ocell.inversion_percent = bin.inversion
+                                    ocell.position_percent = bin.coverage
+
+                                    for [begin, end] in bin.nucleotide_ranges:
+                                        oregion = ontology.Region()
+                                        oregion.begin = begin
+                                        oregion.end = end
+                                        ocell.cell_region.append(oregion)
+
+                                    obin.cells.append(ocell)
+
+                    # links between components and their bins
+                    for component, ocomp in zip(schematic.components, zoom_level.components):
+                        # links: arrivals
+                        link_counter = 0
+                        for link in component.arrivals:
+                            link_counter = link_counter + 1
+                            olink = ontology.Link()
+                            olink.id = link_counter
+
+                            if link.upstream in obin_dict:
+                                olink.departure = obin_dict[link.upstream]
+
+                            if link.downstream in obin_dict:
+                                olink.arrival   = obin_dict[link.downstream]
+
+                            olink.paths = (link.participants + 1).tolist()
+                            ocomp.links.append(olink)
+
+                        for link in component.departures:
+                            link_counter = link_counter + 1
+                            olink = ontology.Link()
+                            olink.id = link_counter
+
+                            if link.upstream in obin_dict:
+                                olink.departure = obin_dict[link.upstream]
+
+                            if link.downstream in obin_dict:
+                                olink.arrival = obin_dict[link.downstream]
+
+                            olink.paths = (link.participants + 1).tolist()
+                            ocomp.links.append(olink)
+
+
+                    g = Graph()
+                    vg = Namespace('http://biohackathon.org/resource/vg#')
+                    faldo = Namespace('http://biohackathon.org/resource/faldo#')
+                    g.bind('vg', vg)
+                    g.bind('faldo', faldo)
+
+                    zoom_level.add_to_graph(g, vg, faldo)  # here the magic happens
+
+                    p = ontology_folder.joinpath(schematic.ttl_filename(i))
+                    g.serialize(destination=str(p), format='turtle', encoding='utf-8')
+
+
         return bin2file_mapping
 
     def find_cut_points_in_file_split(self, bins_per_file, column_counts):
@@ -134,6 +217,9 @@ def filename(self, nth_file):
     def fasta_filename(self, nth_file):
         return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.fa'
 
+    def ttl_filename(self, nth_file):
+        return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.ttl'
+
     def write_index_file(self, folder, bin2file_mapping):
 
         file_contents = {'bin_width': self.bin_width,
diff --git a/matrixcomponent/matrix.py b/matrixcomponent/matrix.py
index 269753d..b57c1f5 100644
--- a/matrixcomponent/matrix.py
+++ b/matrixcomponent/matrix.py
@@ -12,6 +12,7 @@ class Bin:
     coverage: float
     inversion: float
     nucleotide_ranges: List[List[int]]
+    path_id: int
     sequence: str = ''
 
 ## Path is all for input files
diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py
new file mode 100644
index 0000000..2d070cc
--- /dev/null
+++ b/matrixcomponent/ontology.py
@@ -0,0 +1,230 @@
+from typing import List
+from rdflib import Namespace, Graph, Literal, URIRef, RDF
+
+
+class Path:
+    id: str
+    ns: URIRef
+
+    def __init__(self):
+        ns = Namespace("") # should be empty!
+
+    def __str__(self):
+        return "path" + str(self.id)  # path1
+
+    def ns_term(self):
+        return self.ns + "/" + str(self)
+
+    def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
+        path = self.ns + str(self)  # str representation
+
+        # add the object itself
+#        graph.add((path, RDF.type, faldo.Path))
+
+        # add its properties, recursively if needed
+
+
+class Region:
+    ns: URIRef
+    begin: int
+    end: int
+
+    def __str__(self):
+        return "region/" + str(self.begin) + "-" + str(self.end)  # region/2-10
+
+    def ns_term(self):
+        return self.ns + str(self) # path1/...
+
+    def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
+        region = self.ns_term()  # str representation
+
+        # add the object itself
+        graph.add( (region, RDF.type, faldo.Region) )
+
+        # add its properties, recursively if needed
+        graph.add( (region, faldo.begin, self.ns + "position/" + str(self.begin)) )
+        graph.add( (region, faldo.end, self.ns + "position/" + str(self.end)) )
+
+
+class Cell:
+    id: int
+    path_id: int
+    ns: URIRef
+    position_percent: float
+    inversion_percent: float
+    cell_region: List[Region] # [[1,4], [7,11]]
+
+    def __init__(self):
+        self.cell_region = []
+
+    def __str__(self):
+        return "cell" + str(self.id)  # cell1
+
+    def ns_term(self):
+        return self.ns + str(self)
+
+    def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
+        cell = self.ns_term()
+        inner_ns = URIRef("path" + str(self.path_id) + "/")
+
+        # add the object itself
+        graph.add((cell, RDF.type, vg.Cell))
+
+        # add its properties, recursively if needed
+        graph.add((cell, vg.positionPercent, Literal(self.position_percent)))
+        graph.add((cell, vg.inversionPercent, Literal(self.inversion_percent)))
+        for region in self.cell_region:
+            region.ns = inner_ns
+            graph.add((cell, vg.cellRegion, region.ns_term()))  # can have multiple bins
+            region.add_to_graph(graph, vg, faldo)
+
+
+class Bin:
+    ns: URIRef
+    bin_rank: int
+    path_id: int
+    bin_edge: object # pointer to the next Bin
+    cells: List[Region]
+
+    def __init__(self):
+        self.bin_edge = None
+        self.cells = []
+
+    def __str__(self):
+        return "bin" + str(self.bin_rank)  # bin1
+
+    def ns_term(self):
+        return self.ns + str(self)
+
+    def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
+        bin = self.ns_term()
+        inner_ns = bin + "/"
+
+        # add the object itself
+        graph.add((bin, RDF.type, vg.Bin))
+
+        # add its properties, recursively if needed
+        graph.add((bin, vg.binRank, Literal(self.bin_rank)))
+        for cell in self.cells:
+            cell.ns = inner_ns
+            cell.path_id = self.path_id
+            graph.add((bin, vg.cells, cell.ns_term()))  # can have multiple bins
+            cell.add_to_graph(graph, vg, faldo)
+
+        # add the reference to another object if needed
+        if self.bin_edge is not None:
+            graph.add((inner_ns, vg.binEdge, Literal(self.bin_edge))) # __str__ will be called
+
+
+class Link:
+    id: int
+    ns: URIRef
+    arrival: Bin
+    departure: Bin
+    paths: List[Path]
+
+    def __init__(self):
+        self.paths = []
+        self.arrival = None
+        self.departure = None
+
+    def __str__(self):
+        return "link" + str(self.id)  # bin1
+
+
+    def ns_term(self):
+        return self.ns + str(self)
+
+
+    def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
+        link = self.ns_term()
+        inner_ns = link + "/"
+
+        # add the object itself
+        graph.add((link, RDF.type, vg.Link))
+
+        # add its properties, recursively if needed
+        if self.arrival:
+            self.arrival.ns = self.ns
+            graph.add((link, vg.arrival, self.arrival.ns_term()))
+
+        if self.departure:
+            self.departure.ns = self.ns
+            graph.add((link, vg.departure, self.departure.ns_term()))
+
+        for path in self.paths:
+            graph.add((link, vg.linkPaths, URIRef("path" + str(path))))  # can have multiple bins
+
+
+class Component:
+    id: int
+    ns: URIRef
+    component_edge: object # Component
+    component_rank: int
+    bins: List[Bin]
+    links: List[Link]
+
+    def __init__(self, id):
+        self.id = id
+        self.component_edge = None
+        self.bins = []
+        self.links = []
+
+    def __str__(self):
+        return "component" + str(self.id) # component1
+
+    def ns_term(self):
+        return self.ns + self.__str__()
+
+    def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
+        component = self.ns_term()
+        inner_ns = component + "/"
+
+        # add the object itself
+        graph.add((component, RDF.type, vg.Component))
+
+        # add its properties, recursively if needed
+        graph.add((component, vg.componentRank, Literal(self.id)))
+        for bin in self.bins:
+            bin.ns = inner_ns
+            graph.add((component, vg.bins, bin.ns_term())) # can have multiple bins
+            bin.add_to_graph(graph, vg, faldo) # add the inner content of each
+
+        for link in self.links:
+            link.ns = inner_ns
+            link.add_to_graph(graph, vg, faldo) # add the inner content of each
+
+
+        # add the reference to another object if needed
+        if self.component_edge is not None:
+            graph.add((component, vg.componentEdge, Literal(self.component_edge))) # __str__ will be called
+
+
+class ZoomLevel:
+    id: str
+    ns: URIRef
+    zoom_factor: int
+    components: List[Component]
+
+    def __init__(self):
+        self.components = []
+
+    def __str__(self):
+        return "zoom" +  str(self.zoom_factor) #zoom1000
+
+    def ns_term(self):
+        return self.ns + self.__str__()
+
+    def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
+        zoomfactor = self.ns_term()
+        inner_ns = zoomfactor + "/"
+
+        # add the object itself
+        graph.add((zoomfactor, RDF.type, vg.ZoomLevel))
+
+        # add its properties, recursively if needed
+        graph.add((zoomfactor, vg.zoomFactor, Literal(self.zoom_factor)))
+        for i,comp in enumerate(self.components):
+            comp.ns = inner_ns
+            graph.add((zoomfactor, vg.components, comp.ns_term()))
+            comp.add_to_graph(graph, vg, faldo)
diff --git a/segmentation.py b/segmentation.py
index c779ddb..1ac535a 100644
--- a/segmentation.py
+++ b/segmentation.py
@@ -67,6 +67,7 @@ def populate_component_matrix(paths: List[Path], schematic: PangenomeSchematic):
             # this case enforces first_bin == last_bin --- comp.matrix[p] has a single element
             for comp, fr in zip(comp_filtered, from_filtered):
                 bin = values[fr]
+                bin.path_id = p # save for later
                 comp.matrix[p] = [bin]
                 comp.occupants[p] = bin.coverage > 0.1
 
@@ -287,14 +288,16 @@ def _split_lines(self, text, width):
         return argparse.HelpFormatter._split_lines(self, text, width)
 
 
-def write_files(folder, odgi_fasta: Path, schematic: PangenomeSchematic):
+def write_files(folder, ontology_folder, odgi_fasta: Path, schematic: PangenomeSchematic):
     os.makedirs(folder, exist_ok=True)  # make directory for all files
+    if ontology_folder:
+        os.makedirs(ontology_folder, exist_ok=True)
 
     fasta = None
     if odgi_fasta:
         fasta = read_contigs(odgi_fasta)[0]
 
-    bin2file_mapping = schematic.split_and_write(args.cells_per_file, folder, fasta)
+    bin2file_mapping = schematic.split_and_write(args.cells_per_file, folder, fasta, ontology_folder)
 
     schematic.write_index_file(folder, bin2file_mapping)
 
@@ -345,6 +348,12 @@ def get_arguments():
                         type=int,
                         help='Tip: do not set this one to more than available CPU cores)')
 
+    parser.add_argument('-t', '--do-ttl',
+                        dest='do_ttl',
+                        default=False,
+                        type=bool,
+                        help='do the ontology turtle output or not)')
+
     args = parser.parse_args()
 
     # file path logic for single or list of files with wildcard *
@@ -392,7 +401,12 @@ def main():
         # this one spits out json and optionally other output files (fasta, ttl)
         path_name = str(bin_width)
         folder_path = osPath(args.output_folder).joinpath(path_name)  # full path
-        write_files(folder_path, args.fasta, schematic)
+
+        ontology_folder_path = None
+        if args.do_ttl:
+            ontology_folder_path = osPath(args.output_folder).joinpath(path_name + '-turtle')
+
+        write_files(folder_path, ontology_folder_path, args.fasta, schematic)
 
         LOGGER.info("Finished processing the file " + json_file)
 

From 44caccc57629ef148db3aaebb575efabd39ed5d6 Mon Sep 17 00:00:00 2001
From: Dmytro Trybushnyi <dima_tr@yahoo.com>
Date: Thu, 30 Apr 2020 23:46:54 +0200
Subject: [PATCH 02/25] further ontology fixes: real path names; better
 forward* and reverse* edges definition for Component and Bin containers;
 Links are a part of ZoomLevel component

---
 matrixcomponent/PangenomeSchematic.py | 111 +++++++++++++++++++++++++-
 matrixcomponent/matrix.py             |   4 +
 matrixcomponent/ontology.py           | 100 +++++++++++------------
 3 files changed, 160 insertions(+), 55 deletions(-)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index daa78ed..6df6516 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -10,7 +10,9 @@
 
 from dataclasses import dataclass
 
-from matrixcomponent import JSON_VERSION
+from rdflib import URIRef, Graph, Namespace
+
+from matrixcomponent import JSON_VERSION, ontology
 from matrixcomponent.matrix import Component, Bin, LinkColumn
 
 from DNASkittleUtils.Contigs import Contig, write_contigs_to_file
@@ -56,7 +58,7 @@ def update_first_last_bin(self):
         self.first_bin = 1  # these have not been properly initialized
         self.last_bin = self.components[-1].last_bin
 
-    def split_and_write(self, cells_per_file, folder, fasta : Contig):
+    def split_and_write(self, cells_per_file, folder, fasta : Contig, ontology_folder):
         """Splits one Schematic into multiple files with their own
         unique first and last_bin based on the volume of data desired per
         file specified by cells_per_file.  """
@@ -100,6 +102,108 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig):
                     c = folder.joinpath(schematic.fasta_filename(i))
                     write_contigs_to_file(c, chunk)
 
+                if ontology_folder:
+                    zoom_level = ontology.ZoomLevel()
+                    zoom_level.zoom_factor = schematic.bin_width
+                    zoom_level.ns = URIRef('pg/')
+
+                    prev_comp_id = -1
+                    cell_counter = 0
+                    ocomp_dict = {}
+                    obin_dict = {}
+                    for ic, component in enumerate(schematic.components):
+                        ocomp = ontology.Component(ic+1)
+                        ocomp.ns = zoom_level.ns_term() + '/'
+                        zoom_level.components.append(ocomp)
+
+                        # save the sequence 1-2-3-..-n as a bi-directed list
+                        if prev_comp_id in ocomp_dict:
+                            prev_comp = ocomp_dict[prev_comp_id]
+                            ocomp.reverse_component_edge = prev_comp.ns_term()
+                            prev_comp.forward_component_edge = ocomp.ns_term()
+
+                        ocomp_dict[ic] = ocomp
+                        prev_comp_id = ic
+
+                        # bins
+                        for bins in component.matrix:
+                            prev_bin_id = -1
+                            for bin in bins:
+                                if bin:
+                                    cur_bin_id = bin.bin_id
+                                    obin = ontology.Bin()
+                                    obin.ns = ocomp.ns_term() + '/'
+                                    obin.bin_rank = cur_bin_id
+                                    obin_dict[cur_bin_id] = obin
+
+                                    # save the sequence 1-2-3-..-n as a bi-directed list
+                                    if prev_bin_id in obin_dict:
+                                        prev_bin = obin_dict[prev_bin_id]
+                                        prev_bin.forward_bin_edge = obin.ns_term()
+                                        obin.reverse_bin_edge = prev_bin.ns_term()
+
+                                    prev_bin_id = cur_bin_id
+                                    ocomp.bins.append(obin)
+
+                                    cell_counter = cell_counter + 1
+                                    ocell = ontology.Cell()
+                                    ocell.id = cell_counter
+                                    ocell.path_id = self.path_names[bin.path_id]  # saved in the populate_component_matrix
+                                    ocell.inversion_percent = bin.inversion
+                                    ocell.position_percent = bin.coverage
+
+                                    # todo: are begin,end the real bin_ids or the compressed ones? a sparse list sense
+                                    for [begin, end] in bin.nucleotide_ranges:
+                                        oregion = ontology.Region()
+                                        oregion.begin = begin
+                                        oregion.end = end
+                                        ocell.cell_region.append(oregion)
+
+                                    obin.cells.append(ocell)
+
+                    # links between components and their bins
+
+                    all_links = []
+                    for component in schematic.components:
+                        # search in both arrivals and departures of the component <-> component links
+                        all_links.extend(component.arrivals + component.departures)
+
+                    link_counter = 0
+                    for link in set(all_links):  # no duplications; we use Link.__hash__() here - not nice ..
+                        if len(link.participants):
+                            link_counter = link_counter + 1
+                            olink = ontology.Link()
+                            olink.id = link_counter
+
+                            from_bin = None
+                            to_bin = None
+                            if link.upstream in obin_dict:
+                                from_bin = obin_dict[link.upstream]
+                                from_bin.forward_bin_edge = link.downstream
+
+                            if link.downstream in obin_dict:
+                                to_bin = obin_dict[link.downstream]
+                                olink.arrival = to_bin.ns_term()
+
+                            if from_bin and to_bin:
+                                from_bin.forward_bin_edge = to_bin.ns_term()
+                                to_bin.reverse_bin_edge = from_bin.ns_term()
+
+                            olink.paths = [self.path_names[k] for k in link.participants]
+                            zoom_level.links.append(olink)
+
+                    g = Graph()
+                    vg = Namespace('http://biohackathon.org/resource/vg#')
+                    faldo = Namespace('http://biohackathon.org/resource/faldo#')
+                    g.bind('vg', vg)
+                    g.bind('faldo', faldo)
+
+                    zoom_level.add_to_graph(g, vg, faldo)  # here the magic happens
+
+                    p = ontology_folder.joinpath(schematic.ttl_filename(i))
+                    g.serialize(destination=str(p), format='turtle', encoding='utf-8')
+
+
         return bin2file_mapping
 
     def find_cut_points_in_file_split(self, columns_per_file, column_counts):
@@ -129,6 +233,9 @@ def filename(self, nth_file):
     def fasta_filename(self, nth_file):
         return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.fa'
 
+    def ttl_filename(self, nth_file):
+        return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.ttl'
+
     def write_index_file(self, folder, bin2file_mapping):
 
         file_contents = {'bin_width': self.bin_width,
diff --git a/matrixcomponent/matrix.py b/matrixcomponent/matrix.py
index 2b15624..7a14874 100644
--- a/matrixcomponent/matrix.py
+++ b/matrixcomponent/matrix.py
@@ -36,6 +36,10 @@ class LinkColumn:
     num_paths: int
     participants: 'numpy.array' # ids of participated path_names
 
+    # todo: not more than 2^32 bins are supported - refactor the ontology code processing the Link items
+    def __hash__(self):
+        return (self.upstream << 32) + self.downstream
+
 
 class Component:
     """Block of co-linear variation within a Graph Matrix
diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py
index 2d070cc..3a3299b 100644
--- a/matrixcomponent/ontology.py
+++ b/matrixcomponent/ontology.py
@@ -2,28 +2,6 @@
 from rdflib import Namespace, Graph, Literal, URIRef, RDF
 
 
-class Path:
-    id: str
-    ns: URIRef
-
-    def __init__(self):
-        ns = Namespace("") # should be empty!
-
-    def __str__(self):
-        return "path" + str(self.id)  # path1
-
-    def ns_term(self):
-        return self.ns + "/" + str(self)
-
-    def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
-        path = self.ns + str(self)  # str representation
-
-        # add the object itself
-#        graph.add((path, RDF.type, faldo.Path))
-
-        # add its properties, recursively if needed
-
-
 class Region:
     ns: URIRef
     begin: int
@@ -48,7 +26,7 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
 
 class Cell:
     id: int
-    path_id: int
+    path_id: str
     ns: URIRef
     position_percent: float
     inversion_percent: float
@@ -65,7 +43,7 @@ def ns_term(self):
 
     def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         cell = self.ns_term()
-        inner_ns = URIRef("path" + str(self.path_id) + "/")
+        inner_ns = URIRef(self.path_id + "/")
 
         # add the object itself
         graph.add((cell, RDF.type, vg.Cell))
@@ -82,13 +60,14 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
 class Bin:
     ns: URIRef
     bin_rank: int
-    path_id: int
-    bin_edge: object # pointer to the next Bin
+    forward_bin_edge: str
+    reverse_bin_edge: str
     cells: List[Region]
 
     def __init__(self):
-        self.bin_edge = None
         self.cells = []
+        self.forward_bin_edge = ''
+        self.reverse_bin_edge = ''
 
     def __str__(self):
         return "bin" + str(self.bin_rank)  # bin1
@@ -107,26 +86,32 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         graph.add((bin, vg.binRank, Literal(self.bin_rank)))
         for cell in self.cells:
             cell.ns = inner_ns
-            cell.path_id = self.path_id
             graph.add((bin, vg.cells, cell.ns_term()))  # can have multiple bins
             cell.add_to_graph(graph, vg, faldo)
 
         # add the reference to another object if needed
-        if self.bin_edge is not None:
-            graph.add((inner_ns, vg.binEdge, Literal(self.bin_edge))) # __str__ will be called
+        if self.forward_bin_edge:
+            graph.add((bin, vg.forwardBinEdge, URIRef(self.forward_bin_edge)))
+
+        if self.reverse_bin_edge:
+            graph.add((bin, vg.reverseBinEdge, URIRef(self.reverse_bin_edge)))
 
 
 class Link:
     id: int
     ns: URIRef
-    arrival: Bin
-    departure: Bin
-    paths: List[Path]
+    arrival: str
+    departure: str
+    paths: List[str]
+    forward_link_edge: int
+    reverse_link_edge: int
 
     def __init__(self):
         self.paths = []
-        self.arrival = None
-        self.departure = None
+        self.forward_link_edge = -1
+        self.reverse_link_edge = -1
+        self.arrival = ''
+        self.departure = ''
 
     def __str__(self):
         return "link" + str(self.id)  # bin1
@@ -145,30 +130,35 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
 
         # add its properties, recursively if needed
         if self.arrival:
-            self.arrival.ns = self.ns
-            graph.add((link, vg.arrival, self.arrival.ns_term()))
+            graph.add((link, vg.arrival, URIRef(self.arrival)))
 
         if self.departure:
-            self.departure.ns = self.ns
-            graph.add((link, vg.departure, self.departure.ns_term()))
+            graph.add((link, vg.departure, URIRef(self.departure)))
 
         for path in self.paths:
-            graph.add((link, vg.linkPaths, URIRef("path" + str(path))))  # can have multiple bins
+            graph.add((link, vg.linkPaths, URIRef(path)))  # can have multiple bins
+
+        # add the reference to another object if needed
+        if self.forward_link_edge > -1:
+            graph.add((link, vg.forwardLinkEdge, self.ns + "link" + str(self.forward_link_edge)))
+
+        if self.reverse_link_edge > -1:
+            graph.add((link, vg.reverseLinkEdge, self.ns + "link" + str(self.reverse_link_edge)))
 
 
 class Component:
     id: int
     ns: URIRef
-    component_edge: object # Component
+    forward_component_edge: str  # id of the next Component
+    reverse_component_edge: str  # id of the previous Component
     component_rank: int
     bins: List[Bin]
-    links: List[Link]
 
     def __init__(self, id):
         self.id = id
-        self.component_edge = None
         self.bins = []
-        self.links = []
+        self.forward_component_edge = ''
+        self.reverse_component_edge = ''
 
     def __str__(self):
         return "component" + str(self.id) # component1
@@ -187,17 +177,15 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         graph.add((component, vg.componentRank, Literal(self.id)))
         for bin in self.bins:
             bin.ns = inner_ns
-            graph.add((component, vg.bins, bin.ns_term())) # can have multiple bins
-            bin.add_to_graph(graph, vg, faldo) # add the inner content of each
-
-        for link in self.links:
-            link.ns = inner_ns
-            link.add_to_graph(graph, vg, faldo) # add the inner content of each
-
+            graph.add((component, vg.bins, bin.ns_term()))  # can have multiple bins
+            bin.add_to_graph(graph, vg, faldo)  # add the inner content of each
 
         # add the reference to another object if needed
-        if self.component_edge is not None:
-            graph.add((component, vg.componentEdge, Literal(self.component_edge))) # __str__ will be called
+        if self.forward_component_edge:
+            graph.add((component, vg.forwardComponentEdge, URIRef(self.forward_component_edge)))
+
+        if self.reverse_component_edge:
+            graph.add((component, vg.reverseComponentEdge, URIRef(self.reverse_component_edge)))
 
 
 class ZoomLevel:
@@ -205,9 +193,11 @@ class ZoomLevel:
     ns: URIRef
     zoom_factor: int
     components: List[Component]
+    links: List[Link]
 
     def __init__(self):
         self.components = []
+        self.links = []
 
     def __str__(self):
         return "zoom" +  str(self.zoom_factor) #zoom1000
@@ -228,3 +218,7 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
             comp.ns = inner_ns
             graph.add((zoomfactor, vg.components, comp.ns_term()))
             comp.add_to_graph(graph, vg, faldo)
+
+        for link in self.links:
+            link.ns = inner_ns
+            link.add_to_graph(graph, vg, faldo)

From 29550fdd10617c76e8c5e25e5cb6feb74067e916 Mon Sep 17 00:00:00 2001
From: Dmytro Trybushnyi <dima_tr@yahoo.com>
Date: Mon, 11 May 2020 16:01:39 +0200
Subject: [PATCH 03/25] - do not forget to store the path_id - use
 faldo.ExactPosition when appropriate

---
 matrixcomponent/ontology.py | 22 +++++++++++++++-------
 segmentation.py             |  1 +
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py
index 3a3299b..00bf6f9 100644
--- a/matrixcomponent/ontology.py
+++ b/matrixcomponent/ontology.py
@@ -8,7 +8,10 @@ class Region:
     end: int
 
     def __str__(self):
-        return "region/" + str(self.begin) + "-" + str(self.end)  # region/2-10
+        suffix = str(self.begin)  # region/2
+        if self.begin != self.end:
+            suffix = suffix + "-" + str(self.end)  # region/2-10
+        return "region/" + suffix
 
     def ns_term(self):
         return self.ns + str(self) # path1/...
@@ -16,12 +19,17 @@ def ns_term(self):
     def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         region = self.ns_term()  # str representation
 
-        # add the object itself
-        graph.add( (region, RDF.type, faldo.Region) )
+        if self.begin == self.end:
+            # add the object itself
+            graph.add((region, RDF.type, faldo.ExactPosition))
+            graph.add((region, faldo.position, Literal(self.begin)))
+        else:
+            # add the object itself
+            graph.add( (region, RDF.type, faldo.Region) )
 
-        # add its properties, recursively if needed
-        graph.add( (region, faldo.begin, self.ns + "position/" + str(self.begin)) )
-        graph.add( (region, faldo.end, self.ns + "position/" + str(self.end)) )
+            # add its properties, recursively if needed
+            graph.add( (region, faldo.begin, self.ns + "position/" + str(self.begin)) )
+            graph.add( (region, faldo.end, self.ns + "position/" + str(self.end)) )
 
 
 class Cell:
@@ -36,7 +44,7 @@ def __init__(self):
         self.cell_region = []
 
     def __str__(self):
-        return "cell" + str(self.id)  # cell1
+        return "cell" + self.path_id # cell1
 
     def ns_term(self):
         return self.ns + str(self)
diff --git a/segmentation.py b/segmentation.py
index 3f5baf6..7fc3046 100644
--- a/segmentation.py
+++ b/segmentation.py
@@ -86,6 +86,7 @@ def populate_component_matrix(paths: List[Path], schematic: PangenomeSchematic):
                 sliced = values[fr:to]
                 for bin in sliced:
                     padded[bin.bin_id - fb] = bin # do not create objects, simply link them
+                    bin.path_id = p  # save for later
                 comp.matrix[p] = padded
                 comp.occupants[p] = any([bin.coverage > 0.1 for bin in sliced])
 

From b8d0e162ab424ea953d5a4836886898ec802c1a2 Mon Sep 17 00:00:00 2001
From: Dmytro Trybushnyi <dima_tr@yahoo.com>
Date: Wed, 13 May 2020 00:07:53 +0200
Subject: [PATCH 04/25] - explicit faldo:ExactPosition containers - every Link
 has linkRank numbered after the pair sort (component.id,
 [component.departures])

---
 matrixcomponent/PangenomeSchematic.py | 57 ++++++++++++++++-----------
 matrixcomponent/ontology.py           | 48 +++++++++++++++-------
 2 files changed, 68 insertions(+), 37 deletions(-)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index 6df6516..acbd38f 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -111,6 +111,7 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, ontology_folde
                     cell_counter = 0
                     ocomp_dict = {}
                     obin_dict = {}
+                    oposition_dict = {}
                     for ic, component in enumerate(schematic.components):
                         ocomp = ontology.Component(ic+1)
                         ocomp.ns = zoom_level.ns_term() + '/'
@@ -153,44 +154,54 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, ontology_folde
                                     ocell.position_percent = bin.coverage
 
                                     # todo: are begin,end the real bin_ids or the compressed ones? a sparse list sense
+                                    cell_ns = URIRef(ocell.path_id + "/")
                                     for [begin, end] in bin.nucleotide_ranges:
                                         oregion = ontology.Region()
                                         oregion.begin = begin
                                         oregion.end = end
                                         ocell.cell_region.append(oregion)
 
+                                        oposition_begin = ontology.Position(begin, cell_ns)
+                                        oposition_end = ontology.Position(end, cell_ns)
+                                        oposition_dict[oposition_begin.ns_term()] = oposition_begin
+                                        oposition_dict[oposition_end.ns_term()] = oposition_end
+
                                     obin.cells.append(ocell)
 
                     # links between components and their bins
 
-                    all_links = []
+                    olink_dict = {}
+                    link_counter = 0
                     for component in schematic.components:
-                        # search in both arrivals and departures of the component <-> component links
-                        all_links.extend(component.arrivals + component.departures)
+                        # search in all arrivals component <-> component links; departures are iterated automatically
+                        # every link from departures will be in some other arrival
+                        for link in component.departures:
+                            if len(link.participants):
+                                link_counter = link_counter + 1
+                                olink = ontology.Link()
+                                olink.id = link_counter
+                                olink_dict[link_counter] = olink
 
-                    link_counter = 0
-                    for link in set(all_links):  # no duplications; we use Link.__hash__() here - not nice ..
-                        if len(link.participants):
-                            link_counter = link_counter + 1
-                            olink = ontology.Link()
-                            olink.id = link_counter
 
-                            from_bin = None
-                            to_bin = None
-                            if link.upstream in obin_dict:
-                                from_bin = obin_dict[link.upstream]
-                                from_bin.forward_bin_edge = link.downstream
+                    link_counter = 0
+                    for component in schematic.components:
+                        # search in all arrivals component <-> component links; departures are iterated automatically
+                        # every link from departures will be in some other arrival
+                        for link in component.departures:
+                            if len(link.participants):
+                                link_counter = link_counter + 1
+                                olink = olink_dict[link_counter]
 
-                            if link.downstream in obin_dict:
-                                to_bin = obin_dict[link.downstream]
-                                olink.arrival = to_bin.ns_term()
+                                if link.upstream in obin_dict:
+                                    from_bin = obin_dict[link.upstream]
+                                    olink.departure = from_bin.ns_term()
 
-                            if from_bin and to_bin:
-                                from_bin.forward_bin_edge = to_bin.ns_term()
-                                to_bin.reverse_bin_edge = from_bin.ns_term()
+                                if link.downstream in obin_dict:
+                                    to_bin = obin_dict[link.downstream]
+                                    olink.arrival = to_bin.ns_term()
 
-                            olink.paths = [self.path_names[k] for k in link.participants]
-                            zoom_level.links.append(olink)
+                                olink.paths = [self.path_names[k] for k in link.participants]
+                                zoom_level.links.append(olink)
 
                     g = Graph()
                     vg = Namespace('http://biohackathon.org/resource/vg#')
@@ -199,6 +210,8 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, ontology_folde
                     g.bind('faldo', faldo)
 
                     zoom_level.add_to_graph(g, vg, faldo)  # here the magic happens
+                    for oposition in oposition_dict.values():
+                        oposition.add_to_graph(g, vg, faldo)
 
                     p = ontology_folder.joinpath(schematic.ttl_filename(i))
                     g.serialize(destination=str(p), format='turtle', encoding='utf-8')
diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py
index 00bf6f9..402d41e 100644
--- a/matrixcomponent/ontology.py
+++ b/matrixcomponent/ontology.py
@@ -2,16 +2,37 @@
 from rdflib import Namespace, Graph, Literal, URIRef, RDF
 
 
+class Position:
+    id: int
+    ns: URIRef
+
+    def __init__(self, id, ns):
+        self.id = id
+        self.ns = ns
+
+    def __str__(self):
+        return str(self.id)
+
+    def ns_term(self):
+        return self.ns + str(self) # path1/2
+
+    def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
+        position = self.ns_term()  # str representation
+
+        # add the object itself
+        graph.add((position, RDF.type, faldo.ExactPosition))
+
+        # add its properties, recursively if needed
+        graph.add((position, faldo.position, Literal(self.id)))
+
+
 class Region:
     ns: URIRef
     begin: int
     end: int
 
     def __str__(self):
-        suffix = str(self.begin)  # region/2
-        if self.begin != self.end:
-            suffix = suffix + "-" + str(self.end)  # region/2-10
-        return "region/" + suffix
+        return str(self.begin) + "-" + str(self.end)  # 2-10
 
     def ns_term(self):
         return self.ns + str(self) # path1/...
@@ -19,17 +40,12 @@ def ns_term(self):
     def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         region = self.ns_term()  # str representation
 
-        if self.begin == self.end:
-            # add the object itself
-            graph.add((region, RDF.type, faldo.ExactPosition))
-            graph.add((region, faldo.position, Literal(self.begin)))
-        else:
-            # add the object itself
-            graph.add( (region, RDF.type, faldo.Region) )
+        # add the object itself
+        graph.add((region, RDF.type, faldo.Region))
 
-            # add its properties, recursively if needed
-            graph.add( (region, faldo.begin, self.ns + "position/" + str(self.begin)) )
-            graph.add( (region, faldo.end, self.ns + "position/" + str(self.end)) )
+        # add its properties, recursively if needed
+        graph.add((region, faldo.begin, self.ns + str(self.begin)))
+        graph.add((region, faldo.end, self.ns + str(self.end)))
 
 
 class Cell:
@@ -61,7 +77,7 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         graph.add((cell, vg.inversionPercent, Literal(self.inversion_percent)))
         for region in self.cell_region:
             region.ns = inner_ns
-            graph.add((cell, vg.cellRegion, region.ns_term()))  # can have multiple bins
+            graph.add((cell, vg.cellRegions, region.ns_term()))  # can have multiple regions
             region.add_to_graph(graph, vg, faldo)
 
 
@@ -137,6 +153,8 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         graph.add((link, RDF.type, vg.Link))
 
         # add its properties, recursively if needed
+        graph.add((link, vg.linkRank, Literal(self.id)))
+
         if self.arrival:
             graph.add((link, vg.arrival, URIRef(self.arrival)))
 

From 26e69015b3b20397801e601a0c898f99cdf4f74a Mon Sep 17 00:00:00 2001
From: Dmytro Trybushnyi <dima_tr@yahoo.com>
Date: Tue, 16 Jun 2020 11:48:53 +0200
Subject: [PATCH 05/25] write faldo:ForwardStrandPosition, faldo:position and
 faldo:reference in each Position. Add Path write out

---
 matrixcomponent/PangenomeSchematic.py | 22 ++++++++++++------
 matrixcomponent/ontology.py           | 33 ++++++++++++++++++++++++---
 2 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index f401d7f..db9777e 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -166,7 +166,7 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
                         # bins
                         for bins in component.matrix:
                             prev_bin_id = -1
-                            for bin in bins:
+                            for bin in bins[1][1]:  # follow the compressed format
                                 if bin:
                                     cur_bin_id = bin.bin_id
                                     obin = ontology.Bin()
@@ -192,14 +192,19 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
 
                                     # todo: are begin,end the real bin_ids or the compressed ones? a sparse list sense
                                     cell_ns = URIRef(ocell.path_id + "/")
-                                    for [begin, end] in bin.nucleotide_ranges:
+                                    for i in range(0, len(bin.nucleotide_ranges), 2):
+                                        begin, end = bin.nucleotide_ranges[i], bin.nucleotide_ranges[i+1]
+                                        real_begin = begin if begin else end
+                                        real_end = end if end else begin
+
                                         oregion = ontology.Region()
-                                        oregion.begin = begin
-                                        oregion.end = end
+                                        oregion.begin = real_begin
+                                        oregion.end = real_end
                                         ocell.cell_region.append(oregion)
 
-                                        oposition_begin = ontology.Position(begin, cell_ns)
-                                        oposition_end = ontology.Position(end, cell_ns)
+                                        path = self.path_names[bin.path_id]
+                                        oposition_begin = ontology.Position(real_begin, begin <= end, path, cell_ns)
+                                        oposition_end = ontology.Position(real_end, begin <= end, path, cell_ns)
                                         oposition_dict[oposition_begin.ns_term()] = oposition_begin
                                         oposition_dict[oposition_end.ns_term()] = oposition_end
 
@@ -246,9 +251,12 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
                     g.bind('vg', vg)
                     g.bind('faldo', faldo)
 
-                    zoom_level.add_to_graph(g, vg, faldo)  # here the magic happens
+                    # here the magic happens
+                    zoom_level.add_to_graph(g, vg, faldo)
                     for oposition in oposition_dict.values():
                         oposition.add_to_graph(g, vg, faldo)
+                    for path in self.path_names:
+                        ontology.Path(path).add_to_graph(g, vg, faldo)
 
                     p = ontology_folder.joinpath(schematic.ttl_filename(i))
                     g.serialize(destination=str(p), format='turtle', encoding='utf-8')
diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py
index 402d41e..487a7ce 100644
--- a/matrixcomponent/ontology.py
+++ b/matrixcomponent/ontology.py
@@ -2,12 +2,32 @@
 from rdflib import Namespace, Graph, Literal, URIRef, RDF
 
 
+class Path:
+    path: str
+
+    def __init__(self, path):
+        self.path = path
+
+    def ns_term(self):
+        return self.path # path1
+
+    def add_to_graph(self, graph: Graph, vg: Namespace, faldo: Namespace) -> None:
+        path = URIRef(self.ns_term())  # str representation
+
+        # add the object itself
+        graph.add((path, RDF.type, vg.Path))
+
+
 class Position:
     id: int
+    is_forward: bool
+    path: str
     ns: URIRef
 
-    def __init__(self, id, ns):
+    def __init__(self, id, is_forward, path, ns):
         self.id = id
+        self.is_forward = is_forward
+        self.path = path
         self.ns = ns
 
     def __str__(self):
@@ -21,9 +41,14 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
 
         # add the object itself
         graph.add((position, RDF.type, faldo.ExactPosition))
+        if (self.is_forward):
+            graph.add((position, RDF.type, faldo.ForwardStrandPosition))
+        else:
+            graph.add((position, RDF.type, faldo.BackwardStrandPosition))
 
         # add its properties, recursively if needed
         graph.add((position, faldo.position, Literal(self.id)))
+        graph.add((position, faldo.reference, URIRef(self.path)))
 
 
 class Region:
@@ -44,8 +69,10 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         graph.add((region, RDF.type, faldo.Region))
 
         # add its properties, recursively if needed
-        graph.add((region, faldo.begin, self.ns + str(self.begin)))
-        graph.add((region, faldo.end, self.ns + str(self.end)))
+        real_begin = self.begin if self.begin else self.end
+        real_end   = self.end if self.end else self.begin
+        graph.add((region, faldo.begin, self.ns + str(real_begin)))
+        graph.add((region, faldo.end, self.ns + str(real_end)))
 
 
 class Cell:

From db95d1d4f03d1c65710afd552db5a3f6fa3f63d8 Mon Sep 17 00:00:00 2001
From: Dmytro Trybushnyi <dima_tr@yahoo.com>
Date: Tue, 16 Jun 2020 14:57:34 +0200
Subject: [PATCH 06/25] create ontology folder when needed

---
 segmentation.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/segmentation.py b/segmentation.py
index e144a3c..cb36fd4 100644
--- a/segmentation.py
+++ b/segmentation.py
@@ -284,6 +284,8 @@ def _split_lines(self, text, width):
 
 def write_files(folder, ontology_folder, odgi_fasta: Path, schematic: PangenomeSchematic, no_adjacent_links):
     os.makedirs(folder, exist_ok=True)  # make directory for all files
+    if ontology_folder:
+        os.makedirs(ontology_folder, exist_ok=True)
 
     fasta = None
     if odgi_fasta:

From fb1a3ec36e881bca2ac8c46a0ba49cfbd8689ac7 Mon Sep 17 00:00:00 2001
From: Dmytro Trybushnyi <dima_tr@yahoo.com>
Date: Tue, 16 Jun 2020 16:06:14 +0200
Subject: [PATCH 07/25] proper name and strand directions

---
 matrixcomponent/PangenomeSchematic.py | 4 ++--
 matrixcomponent/ontology.py           | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index db9777e..9227946 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -203,8 +203,8 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
                                         ocell.cell_region.append(oregion)
 
                                         path = self.path_names[bin.path_id]
-                                        oposition_begin = ontology.Position(real_begin, begin <= end, path, cell_ns)
-                                        oposition_end = ontology.Position(real_end, begin <= end, path, cell_ns)
+                                        oposition_begin = ontology.Position(real_begin, begin <= end | end == 0, path, cell_ns)
+                                        oposition_end = ontology.Position(real_end, begin <= end | end == 0, path, cell_ns)
                                         oposition_dict[oposition_begin.ns_term()] = oposition_begin
                                         oposition_dict[oposition_end.ns_term()] = oposition_end
 
diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py
index 487a7ce..7b1a68b 100644
--- a/matrixcomponent/ontology.py
+++ b/matrixcomponent/ontology.py
@@ -44,7 +44,7 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         if (self.is_forward):
             graph.add((position, RDF.type, faldo.ForwardStrandPosition))
         else:
-            graph.add((position, RDF.type, faldo.BackwardStrandPosition))
+            graph.add((position, RDF.type, faldo.ReverseStrandPosition))
 
         # add its properties, recursively if needed
         graph.add((position, faldo.position, Literal(self.id)))

From 8e233eca77f358528ceef2ece19c6182580c15b0 Mon Sep 17 00:00:00 2001
From: Dmytro Trybushnyi <dima_tr@yahoo.com>
Date: Tue, 16 Jun 2020 16:22:16 +0200
Subject: [PATCH 08/25] directions

---
 matrixcomponent/PangenomeSchematic.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index 9227946..d616139 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -203,8 +203,8 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
                                         ocell.cell_region.append(oregion)
 
                                         path = self.path_names[bin.path_id]
-                                        oposition_begin = ontology.Position(real_begin, begin <= end | end == 0, path, cell_ns)
-                                        oposition_end = ontology.Position(real_end, begin <= end | end == 0, path, cell_ns)
+                                        oposition_begin = ontology.Position(real_begin, begin >= end, path, cell_ns)
+                                        oposition_end = ontology.Position(real_end, begin >= end, path, cell_ns)
                                         oposition_dict[oposition_begin.ns_term()] = oposition_begin
                                         oposition_dict[oposition_end.ns_term()] = oposition_end
 

From d0bff9ade29f84e33875e38988dc5c980aad9d2e Mon Sep 17 00:00:00 2001
From: subwaystation <simon.heumos@qbic.uni-tuebingen.de>
Date: Wed, 17 Jun 2020 21:21:58 +0200
Subject: [PATCH 09/25] fix bin_edge in .ttl output; but might be slow

---
 matrixcomponent/PangenomeSchematic.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index d616139..7229470 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -149,6 +149,10 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
                     ocomp_dict = {}
                     obin_dict = {}
                     oposition_dict = {}
+
+                    min_bin_id = schematic.first_bin
+                    max_bin_id = schematic.last_bin
+
                     for ic, component in enumerate(schematic.components):
                         ocomp = ontology.Component(ic+1)
                         ocomp.ns = zoom_level.ns_term() + '/'
@@ -165,7 +169,6 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
 
                         # bins
                         for bins in component.matrix:
-                            prev_bin_id = -1
                             for bin in bins[1][1]:  # follow the compressed format
                                 if bin:
                                     cur_bin_id = bin.bin_id
@@ -174,13 +177,17 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
                                     obin.bin_rank = cur_bin_id
                                     obin_dict[cur_bin_id] = obin
 
-                                    # save the sequence 1-2-3-..-n as a bi-directed list
-                                    if prev_bin_id in obin_dict:
-                                        prev_bin = obin_dict[prev_bin_id]
-                                        prev_bin.forward_bin_edge = obin.ns_term()
+                                    if (cur_bin_id > min_bin_id):
+                                        prev_bin = ontology.Bin()
+                                        prev_bin.ns = ocomp.ns_term() + '/'
+                                        prev_bin.bin_rank = cur_bin_id - 1
                                         obin.reverse_bin_edge = prev_bin.ns_term()
+                                    if (cur_bin_id < max_bin_id):
+                                        next_bin = ontology.Bin()
+                                        next_bin.ns = ocomp.ns_term() + '/'
+                                        next_bin.bin_rank = cur_bin_id + 1
+                                        obin.forward_bin_edge = next_bin.ns_term()
 
-                                    prev_bin_id = cur_bin_id
                                     ocomp.bins.append(obin)
 
                                     cell_counter = cell_counter + 1

From 3741e10314486f1eb42ad37a3383220af3ec0968 Mon Sep 17 00:00:00 2001
From: Dmytro Trybushnyi <dima_tr@yahoo.com>
Date: Thu, 18 Jun 2020 10:23:33 +0200
Subject: [PATCH 10/25] recycle temp objects, do not create new ones

---
 matrixcomponent/PangenomeSchematic.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index 7229470..f7199ab 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -167,26 +167,26 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
                         ocomp_dict[ic] = ocomp
                         prev_comp_id = ic
 
+                        obin_ns = ocomp.ns_term() + '/'
+                        obin_tmp = ontology.Bin()
+                        obin_tmp.ns = obin_ns
+
                         # bins
                         for bins in component.matrix:
                             for bin in bins[1][1]:  # follow the compressed format
                                 if bin:
                                     cur_bin_id = bin.bin_id
                                     obin = ontology.Bin()
-                                    obin.ns = ocomp.ns_term() + '/'
+                                    obin.ns = obin_ns
                                     obin.bin_rank = cur_bin_id
                                     obin_dict[cur_bin_id] = obin
 
                                     if (cur_bin_id > min_bin_id):
-                                        prev_bin = ontology.Bin()
-                                        prev_bin.ns = ocomp.ns_term() + '/'
-                                        prev_bin.bin_rank = cur_bin_id - 1
-                                        obin.reverse_bin_edge = prev_bin.ns_term()
+                                        obin_tmp.bin_rank = cur_bin_id - 1
+                                        obin.reverse_bin_edge = obin_tmp.ns_term()  # string value
                                     if (cur_bin_id < max_bin_id):
-                                        next_bin = ontology.Bin()
-                                        next_bin.ns = ocomp.ns_term() + '/'
-                                        next_bin.bin_rank = cur_bin_id + 1
-                                        obin.forward_bin_edge = next_bin.ns_term()
+                                        obin_tmp.bin_rank = cur_bin_id + 1
+                                        obin.forward_bin_edge = obin_tmp.ns_term()  # string value
 
                                     ocomp.bins.append(obin)
 

From d50210bbceb105ed230db4dc8dcf356ede838c2d Mon Sep 17 00:00:00 2001
From: Dmytro Trybushnyi <dima_tr@yahoo.com>
Date: Thu, 18 Jun 2020 17:58:12 +0200
Subject: [PATCH 11/25] positionPercent and inversionPercent are printed as
 doubles

---
 matrixcomponent/ontology.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py
index 7b1a68b..0f7833d 100644
--- a/matrixcomponent/ontology.py
+++ b/matrixcomponent/ontology.py
@@ -1,5 +1,5 @@
 from typing import List
-from rdflib import Namespace, Graph, Literal, URIRef, RDF
+from rdflib import Namespace, Graph, Literal, URIRef, RDF, XSD
 
 
 class Path:
@@ -100,8 +100,8 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         graph.add((cell, RDF.type, vg.Cell))
 
         # add its properties, recursively if needed
-        graph.add((cell, vg.positionPercent, Literal(self.position_percent)))
-        graph.add((cell, vg.inversionPercent, Literal(self.inversion_percent)))
+        graph.add((cell, vg.positionPercent, Literal(self.position_percent, datatype=XSD.double)))
+        graph.add((cell, vg.inversionPercent, Literal(self.inversion_percent, datatype=XSD.double)))
         for region in self.cell_region:
             region.ns = inner_ns
             graph.add((cell, vg.cellRegions, region.ns_term()))  # can have multiple regions

From 3f4984c1ab56e1424cbf8d674e2b9cbc5b790e29 Mon Sep 17 00:00:00 2001
From: subwaystation <simon.heumos@qbic.uni-tuebingen.de>
Date: Mon, 22 Jun 2020 14:37:03 +0200
Subject: [PATCH 12/25] fix orientation

---
 matrixcomponent/PangenomeSchematic.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index f7199ab..b5e403d 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -210,8 +210,8 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
                                         ocell.cell_region.append(oregion)
 
                                         path = self.path_names[bin.path_id]
-                                        oposition_begin = ontology.Position(real_begin, begin >= end, path, cell_ns)
-                                        oposition_end = ontology.Position(real_end, begin >= end, path, cell_ns)
+                                        oposition_begin = ontology.Position(real_begin, begin < end, path, cell_ns)
+                                        oposition_end = ontology.Position(real_end, begin < end, path, cell_ns)
                                         oposition_dict[oposition_begin.ns_term()] = oposition_begin
                                         oposition_dict[oposition_end.ns_term()] = oposition_end
 

From a96974e4317b0febb532a9da8516a5b13d946ee7 Mon Sep 17 00:00:00 2001
From: subwaystation <simon.heumos@qbic.uni-tuebingen.de>
Date: Wed, 24 Jun 2020 17:49:25 +0200
Subject: [PATCH 13/25] add base IRI

---
 matrixcomponent/PangenomeSchematic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index b5e403d..0aeabc0 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -142,7 +142,7 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
                 if ontology_folder:
                     zoom_level = ontology.ZoomLevel()
                     zoom_level.zoom_factor = schematic.bin_width
-                    zoom_level.ns = URIRef('pg/')
+                    zoom_level.ns = URIRef('http://example.org/pg/')
 
                     prev_comp_id = -1
                     cell_counter = 0

From 350e85d64f8a5d3b888c8fffde5d039fa01573ea Mon Sep 17 00:00:00 2001
From: subwaystation <simon.heumos@qbic.uni-tuebingen.de>
Date: Wed, 24 Jun 2020 22:52:40 +0200
Subject: [PATCH 14/25] added example SPARQL queries

---
 queries/selectBins1To5OfZoomlevel1.rq         | 24 +++++++++++++++++++
 .../selectLinksFilterBins1To5OfZoomLevel1.rq  | 15 ++++++++++++
 queries/selectPangenomeSeq1To5.rq             |  9 +++++++
 3 files changed, 48 insertions(+)
 create mode 100644 queries/selectBins1To5OfZoomlevel1.rq
 create mode 100644 queries/selectLinksFilterBins1To5OfZoomLevel1.rq
 create mode 100644 queries/selectPangenomeSeq1To5.rq

diff --git a/queries/selectBins1To5OfZoomlevel1.rq b/queries/selectBins1To5OfZoomlevel1.rq
new file mode 100644
index 0000000..8e71e99
--- /dev/null
+++ b/queries/selectBins1To5OfZoomlevel1.rq
@@ -0,0 +1,24 @@
+PREFIX vg: <http://biohackathon.org/resource/vg#>
+PREFIX faldo: <http://biohackathon.org/resource/faldo#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+SELECT ?cell ?cellregion ?inversionpercent ?positionpercent ?beginpos ?endpos ?path
+WHERE {
+        ?zoomlevel a vg:ZoomLevel;
+                vg:components ?components;
+                vg:zoomFactor ?zoomfactor .
+        FILTER(?zoomfactor = 1)
+        ?components vg:bins ?bin .
+        ?bin vg:cells ?cell;
+                vg:binRank ?binrank .
+        FILTER(?binrank < 6 && ?binrank > 0)
+        ?cell vg:cellRegions ?faldoregion;
+                vg:inversionPercent ?inversionpercent;
+                vg:positionPercent ?positionpercent .
+        ?faldoregion faldo:begin ?begin;
+                faldo:end ?end .
+        ?begin faldo:position ?beginpos .
+        ?end faldo:position ?endpos .
+        ?begin faldo:reference ?reference .
+        # THIS DOES NOT WORK BUT I DON'T KNOW WHY ?reference rdfs:label ?path .
+}
\ No newline at end of file
diff --git a/queries/selectLinksFilterBins1To5OfZoomLevel1.rq b/queries/selectLinksFilterBins1To5OfZoomLevel1.rq
new file mode 100644
index 0000000..ab2cb21
--- /dev/null
+++ b/queries/selectLinksFilterBins1To5OfZoomLevel1.rq
@@ -0,0 +1,15 @@
+PREFIX vg: <http://biohackathon.org/resource/vg#>
+PREFIX faldo: <http://biohackathon.org/resource/faldo#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+SELECT *
+WHERE {
+        ?link a vg:Link;
+                vg:arrival ?arrivalbin;
+                vg:departure ?departurebin;
+                vg:linkPaths ?path .
+        # THIS DOES NOT WORK AND I DON'T KNOW WHY ?path rdfs:label ?l .
+        ?arrivalbin vg:binRank ?arrivalbinrank .
+        ?departurebin vg:binRank ?departurebinrank .
+        FILTER((?arrivalbinrank < 6 && ?arrivalbinrank > 0) || (?departurebinrank < 6 && ?departurebinrank > 0))
+}
\ No newline at end of file
diff --git a/queries/selectPangenomeSeq1To5.rq b/queries/selectPangenomeSeq1To5.rq
new file mode 100644
index 0000000..464c299
--- /dev/null
+++ b/queries/selectPangenomeSeq1To5.rq
@@ -0,0 +1,9 @@
+PREFIX vg:<http://biohackathon.org/resource/vg#>
+PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+SELECT (SUBSTR(group_concat(?sequence; separator=''), 1,5) as ?panSeq)
+{SELECT
+  * #(SUBSTR(group_concat(?sequence; separator=''), 1,5) as ?panSeq)
+WHERE {?s a vg:Node;
+                rdf:value ?sequence .
+       }
+ORDER BY ?s
\ No newline at end of file

From d2eb34c6f0e17f93587752755979dd0115009804 Mon Sep 17 00:00:00 2001
From: subwaystation <simon.heumos@qbic.uni-tuebingen.de>
Date: Fri, 26 Jun 2020 14:25:51 +0200
Subject: [PATCH 15/25] Link -> ZoomLevel, replace pg with vg, add 'path/'

---
 matrixcomponent/PangenomeSchematic.py            |  3 ++-
 matrixcomponent/ontology.py                      |  6 +++++-
 queries/selectBins1To5OfZoomlevel1.rq            |  1 -
 queries/selectLinksFilterBins1To5OfZoomLevel1.rq | 11 +++++++----
 queries/selectPangenomeSeq1To5.rq                | 16 +++++++++-------
 5 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index 0aeabc0..85bf5d8 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -142,7 +142,7 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
                 if ontology_folder:
                     zoom_level = ontology.ZoomLevel()
                     zoom_level.zoom_factor = schematic.bin_width
-                    zoom_level.ns = URIRef('http://example.org/pg/')
+                    zoom_level.ns = URIRef('http://example.org/vg/')
 
                     prev_comp_id = -1
                     cell_counter = 0
@@ -250,6 +250,7 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
                                     olink.arrival = to_bin.ns_term()
 
                                 olink.paths = [self.path_names[k] for k in link.participants]
+                                olink.linkZoomLevel = zoom_level.ns_term()
                                 zoom_level.links.append(olink)
 
                     g = Graph()
diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py
index 0f7833d..8d9b8e3 100644
--- a/matrixcomponent/ontology.py
+++ b/matrixcomponent/ontology.py
@@ -9,7 +9,7 @@ def __init__(self, path):
         self.path = path
 
     def ns_term(self):
-        return self.path # path1
+        return "path/" + self.path # path1
 
     def add_to_graph(self, graph: Graph, vg: Namespace, faldo: Namespace) -> None:
         path = URIRef(self.ns_term())  # str representation
@@ -156,6 +156,7 @@ class Link:
     paths: List[str]
     forward_link_edge: int
     reverse_link_edge: int
+    linkZoomLevel: str
 
     def __init__(self):
         self.paths = []
@@ -163,6 +164,7 @@ def __init__(self):
         self.reverse_link_edge = -1
         self.arrival = ''
         self.departure = ''
+        self.linkZoomLevel = ''
 
     def __str__(self):
         return "link" + str(self.id)  # bin1
@@ -182,6 +184,8 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         # add its properties, recursively if needed
         graph.add((link, vg.linkRank, Literal(self.id)))
 
+        if self.linkZoomLevel:
+            graph.add((link, vg.linkZoomLevel, URIRef(self.linkZoomLevel)))
         if self.arrival:
             graph.add((link, vg.arrival, URIRef(self.arrival)))
 
diff --git a/queries/selectBins1To5OfZoomlevel1.rq b/queries/selectBins1To5OfZoomlevel1.rq
index 8e71e99..a5c1f5c 100644
--- a/queries/selectBins1To5OfZoomlevel1.rq
+++ b/queries/selectBins1To5OfZoomlevel1.rq
@@ -20,5 +20,4 @@ WHERE {
         ?begin faldo:position ?beginpos .
         ?end faldo:position ?endpos .
         ?begin faldo:reference ?reference .
-        # THIS DOES NOT WORK BUT I DON'T KNOW WHY ?reference rdfs:label ?path .
 }
\ No newline at end of file
diff --git a/queries/selectLinksFilterBins1To5OfZoomLevel1.rq b/queries/selectLinksFilterBins1To5OfZoomLevel1.rq
index ab2cb21..f76b6a3 100644
--- a/queries/selectLinksFilterBins1To5OfZoomLevel1.rq
+++ b/queries/selectLinksFilterBins1To5OfZoomLevel1.rq
@@ -2,14 +2,17 @@ PREFIX vg: <http://biohackathon.org/resource/vg#>
 PREFIX faldo: <http://biohackathon.org/resource/faldo#>
 PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
 
-SELECT *
+SELECT ?link ?path ?arrivalbinrank ?departurebinrank ?zoomfactor
 WHERE {
         ?link a vg:Link;
                 vg:arrival ?arrivalbin;
                 vg:departure ?departurebin;
-                vg:linkPaths ?path .
-        # THIS DOES NOT WORK AND I DON'T KNOW WHY ?path rdfs:label ?l .
+                vg:linkPaths ?path ;
+                vg:linkZoomLevel ?zoomlevel .
         ?arrivalbin vg:binRank ?arrivalbinrank .
         ?departurebin vg:binRank ?departurebinrank .
-        FILTER((?arrivalbinrank < 6 && ?arrivalbinrank > 0) || (?departurebinrank < 6 && ?departurebinrank > 0))
+        FILTER((?arrivalbinrank < 6 && ?arrivalbinrank > 0)
+                || (?departurebinrank < 6 && ?departurebinrank > 0))
+        ?zoomlevel vg:zoomFactor ?zoomfactor .
+        FILTER(?zoomfactor = 1)
 }
\ No newline at end of file
diff --git a/queries/selectPangenomeSeq1To5.rq b/queries/selectPangenomeSeq1To5.rq
index 464c299..4159e0a 100644
--- a/queries/selectPangenomeSeq1To5.rq
+++ b/queries/selectPangenomeSeq1To5.rq
@@ -1,9 +1,11 @@
 PREFIX vg:<http://biohackathon.org/resource/vg#>
 PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
-SELECT (SUBSTR(group_concat(?sequence; separator=''), 1,5) as ?panSeq)
-{SELECT
-  * #(SUBSTR(group_concat(?sequence; separator=''), 1,5) as ?panSeq)
-WHERE {?s a vg:Node;
-                rdf:value ?sequence .
-       }
-ORDER BY ?s
\ No newline at end of file
+
+SELECT (SUBSTR(group_concat(?sequence; separator=''), 1,5) as ?panSeq) {
+    SELECT
+        *
+    WHERE {?s a vg:Node;
+        rdf:value ?sequence .
+    }
+    ORDER BY ?s
+}

From 209d6973562d0b3ece51de65d222ec7c69ec3d34 Mon Sep 17 00:00:00 2001
From: subwaystation <simon.heumos@qbic.uni-tuebingen.de>
Date: Fri, 26 Jun 2020 15:04:37 +0200
Subject: [PATCH 16/25] Actually emit the position percentage instead of the
 coverage of a bin.

---
 matrixcomponent/JSONparser.py         | 2 +-
 matrixcomponent/PangenomeSchematic.py | 2 +-
 matrixcomponent/matrix.py             | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/matrixcomponent/JSONparser.py b/matrixcomponent/JSONparser.py
index e56302d..51ed15b 100644
--- a/matrixcomponent/JSONparser.py
+++ b/matrixcomponent/JSONparser.py
@@ -41,7 +41,7 @@ def process_path(line=None):
             for r in ranges:
                 compressed_ranges.extend(r)
 
-            bin = matrix.Bin(b[0], b[1], b[2], compressed_ranges, 0)
+            bin = matrix.Bin(b[0], b[1], b[2], compressed_ranges, 0, b[3])
             p.bins.setdefault(bin.bin_id, bin)
 
         # do the major part of the segmentation.find_dividers() method
diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index 85bf5d8..cf787e0 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -195,7 +195,7 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
                                     ocell.id = cell_counter
                                     ocell.path_id = self.path_names[bin.path_id]  # saved in the populate_component_matrix
                                     ocell.inversion_percent = bin.inversion
-                                    ocell.position_percent = bin.coverage
+                                    ocell.position_percent = bin.position
 
                                     # todo: are begin,end the real bin_ids or the compressed ones? a sparse list sense
                                     cell_ns = URIRef(ocell.path_id + "/")
diff --git a/matrixcomponent/matrix.py b/matrixcomponent/matrix.py
index 6efe264..d2f3127 100644
--- a/matrixcomponent/matrix.py
+++ b/matrixcomponent/matrix.py
@@ -13,6 +13,7 @@ class Bin(recordclass.dataobject):
     inversion: float
     nucleotide_ranges: 'numpy.array' # List[List[int]] is encoded as a Numpy flat array - this saves memory
     path_id: int
+    position: float
 
 ## Path is all for input files
 

From 7dc028bbbe1ce91548886033e1906e709d1530a3 Mon Sep 17 00:00:00 2001
From: Dmytro Trybushnyi <dima_tr@yahoo.com>
Date: Sat, 4 Jul 2020 19:36:01 +0200
Subject: [PATCH 17/25] a big update: * parallel write out of the gzip
 compressed ontology files - no memory leaks due to the utilization of
 separate processes! * use the N-triples format to be 10x quicker than the
 Turtle (see format='nt' in PangenomeSchematic.py) * be gentle with the string
 variables, do not use "a"+"b" but rather "{0}{1}".format(a,b). This does not
 create small temporary object and leads to a lower memory fragmentation/leak
 * the RDF output folder is named *-rdf

---
 matrixcomponent/PangenomeSchematic.py | 328 +++++++++++++-------------
 matrixcomponent/matrix.py             |   4 -
 matrixcomponent/ontology.py           |  46 +---
 segmentation.py                       |   9 +-
 4 files changed, 177 insertions(+), 210 deletions(-)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index cf787e0..489b336 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -1,4 +1,8 @@
+import gzip
 import json
+import os
+import shutil
+
 from collections import OrderedDict
 from statistics import mean
 
@@ -10,6 +14,7 @@
 
 from dataclasses import dataclass
 
+from joblib import delayed
 from rdflib import URIRef, Graph, Namespace
 
 from matrixcomponent import JSON_VERSION, ontology
@@ -17,6 +22,144 @@
 
 from DNASkittleUtils.Contigs import Contig, write_contigs_to_file
 
+
+# move out of the class to be able to use with joblib
+def write_rdf(schematic, ontology_output_path):
+    zoom_level = ontology.ZoomLevel()
+    zoom_level.zoom_factor = schematic.bin_width
+    zoom_level.ns = URIRef('http://example.org/vg/')
+
+    prev_comp_id = -1
+    cell_counter = 0
+    ocomp_dict = {}
+    obin_dict = {}
+    oposition_dict = {}
+
+    min_bin_id = schematic.first_bin
+    max_bin_id = schematic.last_bin
+
+    for ic, component in enumerate(schematic.components):
+        ocomp = ontology.Component(ic + 1)
+        ocomp.ns = zoom_level.ns_term() + '/'
+        zoom_level.components.append(ocomp)
+
+        # save the sequence 1-2-3-..-n as a bi-directed list
+        if prev_comp_id in ocomp_dict:
+            prev_comp = ocomp_dict[prev_comp_id]
+            ocomp.reverse_component_edge = prev_comp.ns_term()
+            prev_comp.forward_component_edge = ocomp.ns_term()
+
+        ocomp_dict[ic] = ocomp
+        prev_comp_id = ic
+
+        obin_ns = ocomp.ns_term() + '/'
+        obin_tmp = ontology.Bin()
+        obin_tmp.ns = obin_ns
+
+        # bins
+        for bins in component.matrix:
+            for bin in bins[1][1]:  # follow the compressed format
+                if bin:
+                    cur_bin_id = bin.bin_id
+                    obin = ontology.Bin()
+                    obin.ns = obin_ns
+                    obin.bin_rank = cur_bin_id
+                    obin_dict[cur_bin_id] = obin
+
+                    if (cur_bin_id > min_bin_id):
+                        obin_tmp.bin_rank = cur_bin_id - 1
+                        obin.reverse_bin_edge = obin_tmp.ns_term()  # string value
+                    if (cur_bin_id < max_bin_id):
+                        obin_tmp.bin_rank = cur_bin_id + 1
+                        obin.forward_bin_edge = obin_tmp.ns_term()  # string value
+
+                    ocomp.bins.append(obin)
+
+                    cell_counter = cell_counter + 1
+                    ocell = ontology.Cell()
+                    ocell.id = cell_counter
+                    ocell.path_id = schematic.path_names[bin.path_id]  # saved in the populate_component_matrix
+                    ocell.inversion_percent = bin.inversion
+                    ocell.position_percent = bin.position
+
+                    # todo: are begin,end the real bin_ids or the compressed ones? a sparse list sense
+                    cell_ns = URIRef("{0}/".format(ocell.path_id))
+                    for be in range(0, len(bin.nucleotide_ranges), 2):
+                        begin, end = bin.nucleotide_ranges[be], bin.nucleotide_ranges[be + 1]
+                        real_begin = begin if begin else end
+                        real_end = end if end else begin
+
+                        oregion = ontology.Region()
+                        oregion.begin = real_begin
+                        oregion.end = real_end
+                        ocell.cell_region.append(oregion)
+
+                        path = schematic.path_names[bin.path_id]
+                        oposition_begin = ontology.Position(real_begin, begin < end, path, cell_ns)
+                        oposition_end = ontology.Position(real_end, begin < end, path, cell_ns)
+                        oposition_dict[oposition_begin.ns_term()] = oposition_begin
+                        oposition_dict[oposition_end.ns_term()] = oposition_end
+
+                    obin.cells.append(ocell)
+
+    # links between components and their bins
+
+    olink_dict = {}
+    link_counter = 0
+    for component in schematic.components:
+        # search in all arrivals component <-> component links; departures are iterated automatically
+        # every link from departures will be in some other arrival
+        for link in component.departures:
+            if len(link.participants):
+                link_counter = link_counter + 1
+                olink = ontology.Link()
+                olink.id = link_counter
+                olink_dict[link_counter] = olink
+
+    link_counter = 0
+    for component in schematic.components:
+        # search in all arrivals component <-> component links; departures are iterated automatically
+        # every link from departures will be in some other arrival
+        for link in component.departures:
+            if len(link.participants):
+                link_counter = link_counter + 1
+                olink = olink_dict[link_counter]
+
+                if link.upstream in obin_dict:
+                    from_bin = obin_dict[link.upstream]
+                    olink.departure = from_bin.ns_term()
+
+                if link.downstream in obin_dict:
+                    to_bin = obin_dict[link.downstream]
+                    olink.arrival = to_bin.ns_term()
+
+                olink.paths = [schematic.path_names[k] for k in link.participants]
+                olink.linkZoomLevel = zoom_level.ns_term()
+                zoom_level.links.append(olink)
+
+    g = Graph()
+    vg = Namespace('http://biohackathon.org/resource/vg#')
+    faldo = Namespace('http://biohackathon.org/resource/faldo#')
+    g.bind('vg', vg)
+    g.bind('faldo', faldo)
+
+    # here the magic happens
+    zoom_level.add_to_graph(g, vg, faldo)
+    for oposition in oposition_dict.values():
+        oposition.add_to_graph(g, vg, faldo)
+    for path in schematic.path_names:
+        ontology.Path(path).add_to_graph(g, vg, faldo)
+
+    # format='nt' works 10x faster than 'turtle'; the produced files are 3x bigger
+    # do not forget to compress them afterwards - gives 15x diskspace reduction
+    g.serialize(destination=ontology_output_path, format='nt')
+    with open(ontology_output_path, 'rb') as fin:
+        with gzip.open(ontology_output_path + '.gz', 'wb') as fout:
+            shutil.copyfileobj(fin, fout)
+            fout.close()
+    os.remove(ontology_output_path)
+
+
 @dataclass
 class PangenomeSchematic:
     json_version: int
@@ -39,14 +182,7 @@ def dumper(obj):
                     ranges.append([flat_ranges[i], flat_ranges[i+1]])
                 return [obj.coverage, obj.inversion, ranges]
             if isinstance(obj, LinkColumn):
-                # todo: get rid of this once the JS side can work with sparse containers
-                if self.json_version <= 14:
-                    bools = [False] * len(self.path_names)
-                    for i in obj.participants:
-                        bools[i] = True
-                    return {'upstream':obj.upstream, 'downstream':obj.downstream, 'participants':bools}
-                else:
-                    return {'upstream':obj.upstream, 'downstream':obj.downstream, 'participants':obj.participants.tolist()}
+                return {'upstream': obj.upstream, 'downstream': obj.downstream, 'participants': obj.participants.tolist()}
             if isinstance(obj, set):
                 return list(obj)
             try:
@@ -70,28 +206,7 @@ def update_first_last_bin(self):
         self.first_bin = 1  # these have not been properly initialized
         self.last_bin = self.components[-1].last_bin
 
-    def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_links, ontology_folder):
-        # todo: get rid of this once the JS side can work with sparse containers
-        if self.json_version <= 14:
-            empty = []
-            for comp in self.components:
-                bools = [False] * len(self.path_names)
-                for i in comp.occupants:
-                    bools[i] = True
-                comp.occupants = bools
-
-                matrix = [empty] * len(self.path_names)
-                fb, lb = comp.first_bin, comp.last_bin
-                for item in comp.matrix:
-                    padded = [empty] * (lb - fb + 1)
-                    sliced = item[1]
-                    for id, val in zip(sliced[0], sliced[1]):
-                        padded[id] = val
-                    matrix[item[0]] = padded
-
-                comp.matrix = matrix
-
-
+    def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_links, ontology_folder, parallel):
         """Splits one Schematic into multiple files with their own
         unique first and last_bin based on the volume of data desired per
         file specified by cells_per_file.  """
@@ -139,136 +254,20 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
                     c = folder.joinpath(schematic.fasta_filename(i))
                     write_contigs_to_file(c, chunk)
 
-                if ontology_folder:
-                    zoom_level = ontology.ZoomLevel()
-                    zoom_level.zoom_factor = schematic.bin_width
-                    zoom_level.ns = URIRef('http://example.org/vg/')
-
-                    prev_comp_id = -1
-                    cell_counter = 0
-                    ocomp_dict = {}
-                    obin_dict = {}
-                    oposition_dict = {}
-
-                    min_bin_id = schematic.first_bin
-                    max_bin_id = schematic.last_bin
-
-                    for ic, component in enumerate(schematic.components):
-                        ocomp = ontology.Component(ic+1)
-                        ocomp.ns = zoom_level.ns_term() + '/'
-                        zoom_level.components.append(ocomp)
-
-                        # save the sequence 1-2-3-..-n as a bi-directed list
-                        if prev_comp_id in ocomp_dict:
-                            prev_comp = ocomp_dict[prev_comp_id]
-                            ocomp.reverse_component_edge = prev_comp.ns_term()
-                            prev_comp.forward_component_edge = ocomp.ns_term()
-
-                        ocomp_dict[ic] = ocomp
-                        prev_comp_id = ic
-
-                        obin_ns = ocomp.ns_term() + '/'
-                        obin_tmp = ontology.Bin()
-                        obin_tmp.ns = obin_ns
-
-                        # bins
-                        for bins in component.matrix:
-                            for bin in bins[1][1]:  # follow the compressed format
-                                if bin:
-                                    cur_bin_id = bin.bin_id
-                                    obin = ontology.Bin()
-                                    obin.ns = obin_ns
-                                    obin.bin_rank = cur_bin_id
-                                    obin_dict[cur_bin_id] = obin
-
-                                    if (cur_bin_id > min_bin_id):
-                                        obin_tmp.bin_rank = cur_bin_id - 1
-                                        obin.reverse_bin_edge = obin_tmp.ns_term()  # string value
-                                    if (cur_bin_id < max_bin_id):
-                                        obin_tmp.bin_rank = cur_bin_id + 1
-                                        obin.forward_bin_edge = obin_tmp.ns_term()  # string value
-
-                                    ocomp.bins.append(obin)
-
-                                    cell_counter = cell_counter + 1
-                                    ocell = ontology.Cell()
-                                    ocell.id = cell_counter
-                                    ocell.path_id = self.path_names[bin.path_id]  # saved in the populate_component_matrix
-                                    ocell.inversion_percent = bin.inversion
-                                    ocell.position_percent = bin.position
-
-                                    # todo: are begin,end the real bin_ids or the compressed ones? a sparse list sense
-                                    cell_ns = URIRef(ocell.path_id + "/")
-                                    for i in range(0, len(bin.nucleotide_ranges), 2):
-                                        begin, end = bin.nucleotide_ranges[i], bin.nucleotide_ranges[i+1]
-                                        real_begin = begin if begin else end
-                                        real_end = end if end else begin
-
-                                        oregion = ontology.Region()
-                                        oregion.begin = real_begin
-                                        oregion.end = real_end
-                                        ocell.cell_region.append(oregion)
-
-                                        path = self.path_names[bin.path_id]
-                                        oposition_begin = ontology.Position(real_begin, begin < end, path, cell_ns)
-                                        oposition_end = ontology.Position(real_end, begin < end, path, cell_ns)
-                                        oposition_dict[oposition_begin.ns_term()] = oposition_begin
-                                        oposition_dict[oposition_end.ns_term()] = oposition_end
-
-                                    obin.cells.append(ocell)
-
-                    # links between components and their bins
-
-                    olink_dict = {}
-                    link_counter = 0
-                    for component in schematic.components:
-                        # search in all arrivals component <-> component links; departures are iterated automatically
-                        # every link from departures will be in some other arrival
-                        for link in component.departures:
-                            if len(link.participants):
-                                link_counter = link_counter + 1
-                                olink = ontology.Link()
-                                olink.id = link_counter
-                                olink_dict[link_counter] = olink
-
-
-                    link_counter = 0
-                    for component in schematic.components:
-                        # search in all arrivals component <-> component links; departures are iterated automatically
-                        # every link from departures will be in some other arrival
-                        for link in component.departures:
-                            if len(link.participants):
-                                link_counter = link_counter + 1
-                                olink = olink_dict[link_counter]
-
-                                if link.upstream in obin_dict:
-                                    from_bin = obin_dict[link.upstream]
-                                    olink.departure = from_bin.ns_term()
-
-                                if link.downstream in obin_dict:
-                                    to_bin = obin_dict[link.downstream]
-                                    olink.arrival = to_bin.ns_term()
-
-                                olink.paths = [self.path_names[k] for k in link.participants]
-                                olink.linkZoomLevel = zoom_level.ns_term()
-                                zoom_level.links.append(olink)
-
-                    g = Graph()
-                    vg = Namespace('http://biohackathon.org/resource/vg#')
-                    faldo = Namespace('http://biohackathon.org/resource/faldo#')
-                    g.bind('vg', vg)
-                    g.bind('faldo', faldo)
-
-                    # here the magic happens
-                    zoom_level.add_to_graph(g, vg, faldo)
-                    for oposition in oposition_dict.values():
-                        oposition.add_to_graph(g, vg, faldo)
-                    for path in self.path_names:
-                        ontology.Path(path).add_to_graph(g, vg, faldo)
-
-                    p = ontology_folder.joinpath(schematic.ttl_filename(i))
-                    g.serialize(destination=str(p), format='turtle', encoding='utf-8')
-
+        if ontology_folder:
+            # generator for the pairs (Schematics, path) - will be instantiated on the item access!
+            prepared_schematics = ( (PangenomeSchematic(JSON_VERSION, self.bin_width, self.components[cut:cut_points[i + 1]][0].first_bin,
+                                                   self.components[cut:cut_points[i + 1]][-1].last_bin, self.includes_connectors,
+                                                   self.components[cut:cut_points[i + 1]], self.path_names,
+                                                   self.total_nr_files, self.pangenome_length),
+                                     str(ontology_folder.joinpath(self.ttl_filename(i))) )
+                                   for i, cut in enumerate(cut_points[:-1]) if self.components[cut:cut_points[i + 1]] )
+
+            if parallel:
+                results = parallel(delayed(write_rdf)(sch, path) for (sch, path) in prepared_schematics)
+            else:
+                for (sch, path) in prepared_schematics:
+                    write_rdf(sch, path)
 
         return bin2file_mapping
 
@@ -287,12 +286,7 @@ def find_cut_points_in_file_split(self, columns_per_file, column_counts):
     def lazy_average_occupants(self):
         """grab four random components and check how many occupants they have"""
         samples = [self.components[int(len(self.components) * (perc/100))] for perc in range(1, 99)]
-
-        # todo: get rid of this once the JS side can work with sparse containers
-        if self.json_version <= 14:
-            avg_paths = mean([sum(x.occupants) for x in samples])
-        else:
-            avg_paths = mean([len(x.occupants) for x in samples])
+        avg_paths = mean([len(x.occupants) for x in samples])
 
         return avg_paths
 
@@ -306,7 +300,7 @@ def fasta_filename(self, nth_file):
         return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.fa'
 
     def ttl_filename(self, nth_file):
-        return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.ttl'
+        return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.nt'
 
     def write_index_file(self, folder, bin2file_mapping):
 
diff --git a/matrixcomponent/matrix.py b/matrixcomponent/matrix.py
index d2f3127..b2ba9e9 100644
--- a/matrixcomponent/matrix.py
+++ b/matrixcomponent/matrix.py
@@ -38,10 +38,6 @@ class LinkColumn(recordclass.dataobject):
     downstream: int
     participants: 'numpy.array' # ids of participated path_names
 
-    # todo: not more than 2^32 bins are supported - refactor the ontology code processing the Link items
-    def __hash__(self):
-        return (self.upstream << 32) + self.downstream
-
 
 class Component:
     """Block of co-linear variation within a Graph Matrix
diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py
index 8d9b8e3..97479ef 100644
--- a/matrixcomponent/ontology.py
+++ b/matrixcomponent/ontology.py
@@ -9,7 +9,7 @@ def __init__(self, path):
         self.path = path
 
     def ns_term(self):
-        return "path/" + self.path # path1
+        return "path/{0}".format(self.path) # path1
 
     def add_to_graph(self, graph: Graph, vg: Namespace, faldo: Namespace) -> None:
         path = URIRef(self.ns_term())  # str representation
@@ -30,11 +30,8 @@ def __init__(self, id, is_forward, path, ns):
         self.path = path
         self.ns = ns
 
-    def __str__(self):
-        return str(self.id)
-
     def ns_term(self):
-        return self.ns + str(self) # path1/2
+        return self.ns + "{0}".format(str(self.id))
 
     def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         position = self.ns_term()  # str representation
@@ -56,11 +53,8 @@ class Region:
     begin: int
     end: int
 
-    def __str__(self):
-        return str(self.begin) + "-" + str(self.end)  # 2-10
-
     def ns_term(self):
-        return self.ns + str(self) # path1/...
+        return self.ns + "{0}-{1}".format(str(self.begin), str(self.end))
 
     def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         region = self.ns_term()  # str representation
@@ -86,15 +80,12 @@ class Cell:
     def __init__(self):
         self.cell_region = []
 
-    def __str__(self):
-        return "cell" + self.path_id # cell1
-
     def ns_term(self):
-        return self.ns + str(self)
+        return self.ns + "cell{0}".format(str(self.path_id))
 
     def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         cell = self.ns_term()
-        inner_ns = URIRef(self.path_id + "/")
+        inner_ns = URIRef("{0}/".format(self.path_id))
 
         # add the object itself
         graph.add((cell, RDF.type, vg.Cell))
@@ -120,11 +111,8 @@ def __init__(self):
         self.forward_bin_edge = ''
         self.reverse_bin_edge = ''
 
-    def __str__(self):
-        return "bin" + str(self.bin_rank)  # bin1
-
     def ns_term(self):
-        return self.ns + str(self)
+        return self.ns + "bin{0}".format(str(self.bin_rank))
 
     def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         bin = self.ns_term()
@@ -166,17 +154,11 @@ def __init__(self):
         self.departure = ''
         self.linkZoomLevel = ''
 
-    def __str__(self):
-        return "link" + str(self.id)  # bin1
-
-
     def ns_term(self):
-        return self.ns + str(self)
-
+        return self.ns + "link{0}".format(str(self.id))
 
     def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         link = self.ns_term()
-        inner_ns = link + "/"
 
         # add the object itself
         graph.add((link, RDF.type, vg.Link))
@@ -197,10 +179,10 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
 
         # add the reference to another object if needed
         if self.forward_link_edge > -1:
-            graph.add((link, vg.forwardLinkEdge, self.ns + "link" + str(self.forward_link_edge)))
+            graph.add((link, vg.forwardLinkEdge, "{0}link{1}".format(self.ns, str(self.forward_link_edge))))
 
         if self.reverse_link_edge > -1:
-            graph.add((link, vg.reverseLinkEdge, self.ns + "link" + str(self.reverse_link_edge)))
+            graph.add((link, vg.reverseLinkEdge, "{0}link{1}".format(self.ns, str(self.reverse_link_edge))))
 
 
 class Component:
@@ -217,11 +199,8 @@ def __init__(self, id):
         self.forward_component_edge = ''
         self.reverse_component_edge = ''
 
-    def __str__(self):
-        return "component" + str(self.id) # component1
-
     def ns_term(self):
-        return self.ns + self.__str__()
+        return self.ns + "component{0}".format(str(self.id))
 
     def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         component = self.ns_term()
@@ -256,11 +235,8 @@ def __init__(self):
         self.components = []
         self.links = []
 
-    def __str__(self):
-        return "zoom" +  str(self.zoom_factor) #zoom1000
-
     def ns_term(self):
-        return self.ns + self.__str__()
+        return self.ns + "zoom{0}".format(str(self.zoom_factor))
 
     def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         zoomfactor = self.ns_term()
diff --git a/segmentation.py b/segmentation.py
index cb36fd4..b5f3e24 100644
--- a/segmentation.py
+++ b/segmentation.py
@@ -282,7 +282,7 @@ def _split_lines(self, text, width):
         return argparse.HelpFormatter._split_lines(self, text, width)
 
 
-def write_files(folder, ontology_folder, odgi_fasta: Path, schematic: PangenomeSchematic, no_adjacent_links):
+def write_files(folder, ontology_folder, odgi_fasta: Path, schematic: PangenomeSchematic, no_adjacent_links, parallel):
     os.makedirs(folder, exist_ok=True)  # make directory for all files
     if ontology_folder:
         os.makedirs(ontology_folder, exist_ok=True)
@@ -291,7 +291,8 @@ def write_files(folder, ontology_folder, odgi_fasta: Path, schematic: PangenomeS
     if odgi_fasta:
         fasta = read_contigs(odgi_fasta)[0]
 
-    bin2file_mapping = schematic.split_and_write(args.cells_per_file, folder, fasta, no_adjacent_links, ontology_folder)
+    bin2file_mapping = schematic.split_and_write(args.cells_per_file, folder, fasta, no_adjacent_links,
+                                                 ontology_folder, parallel)
 
     schematic.write_index_file(folder, bin2file_mapping)
 
@@ -405,9 +406,9 @@ def main():
 
         ontology_folder_path = None
         if args.do_ttl:
-            ontology_folder_path = osPath(args.output_folder).joinpath(path_name + '-turtle')
+            ontology_folder_path = osPath(args.output_folder).joinpath(path_name + '-rdf')
 
-        write_files(folder_path, ontology_folder_path, args.fasta, schematic, args.no_adjacent_links)
+        write_files(folder_path, ontology_folder_path, args.fasta, schematic, args.no_adjacent_links, parallel)
 
         LOGGER.info("Finished processing the file " + json_file)
 

From 46d9c204798b3735021e3f247ffe1123582fa2b2 Mon Sep 17 00:00:00 2001
From: Toshiyuki Yokoyama <6br@users.noreply.github.com>
Date: Sat, 18 Jul 2020 23:40:15 +0900
Subject: [PATCH 18/25] Update requirements.txt

---
 requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index a40f53d..f2ac401 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ DNASkittleUtils==1.0.13
 numpy==1.18.2
 pytest==5.4.1
 sortedcontainers==2.1.0
-joblib==0.14.1
+joblib==0.1
 numba==0.48.0
 psutil==5.7.0
-recordclass==0.13.2
\ No newline at end of file
+recordclass==0.13.2

From 1f5a379184b9c2427f14a7619bc34ecb39270685 Mon Sep 17 00:00:00 2001
From: 6br <tfdvrt@gmail.com>
Date: Sun, 26 Jul 2020 12:41:07 +0900
Subject: [PATCH 19/25] Add 'path/' on cells

---
 matrixcomponent/ontology.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py
index 97479ef..3922f1d 100644
--- a/matrixcomponent/ontology.py
+++ b/matrixcomponent/ontology.py
@@ -81,11 +81,11 @@ def __init__(self):
         self.cell_region = []
 
     def ns_term(self):
-        return self.ns + "cell{0}".format(str(self.path_id))
+        return self.ns + "cell{0}/path/".format(str(self.path_id))
 
     def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
         cell = self.ns_term()
-        inner_ns = URIRef("{0}/".format(self.path_id))
+        inner_ns = URIRef("{0}/path/".format(self.path_id))
 
         # add the object itself
         graph.add((cell, RDF.type, vg.Cell))

From e014b732b1ff647dc2d5e02f4869f16b4e1bfa69 Mon Sep 17 00:00:00 2001
From: 6br <tfdvrt@gmail.com>
Date: Sun, 26 Jul 2020 13:01:34 +0900
Subject: [PATCH 20/25] Add assertion

---
 matrixcomponent/PangenomeSchematic.py | 1 +
 matrixcomponent/ontology.py           | 1 +
 2 files changed, 2 insertions(+)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index 489b336..1b137e9 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -124,6 +124,7 @@ def write_rdf(schematic, ontology_output_path):
             if len(link.participants):
                 link_counter = link_counter + 1
                 olink = olink_dict[link_counter]
+                assert link.upstream in obin_dict == link.downstream in obin_dict 
 
                 if link.upstream in obin_dict:
                     from_bin = obin_dict[link.upstream]
diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py
index 3922f1d..6d6196b 100644
--- a/matrixcomponent/ontology.py
+++ b/matrixcomponent/ontology.py
@@ -168,6 +168,7 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
 
         if self.linkZoomLevel:
             graph.add((link, vg.linkZoomLevel, URIRef(self.linkZoomLevel)))
+        assert self.arrival and self.departure
         if self.arrival:
             graph.add((link, vg.arrival, URIRef(self.arrival)))
 

From 9beafb3acfb5acf44907d3e1b5bf7a5580918037 Mon Sep 17 00:00:00 2001
From: 6br <tfdvrt@gmail.com>
Date: Sun, 26 Jul 2020 13:14:09 +0900
Subject: [PATCH 21/25] Add assertion

---
 matrixcomponent/PangenomeSchematic.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index 1b137e9..38c0054 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -124,14 +124,14 @@ def write_rdf(schematic, ontology_output_path):
             if len(link.participants):
                 link_counter = link_counter + 1
                 olink = olink_dict[link_counter]
-                assert link.upstream in obin_dict == link.downstream in obin_dict 
+                assert link.upstream in obin_dict == link.downstream in obin_dict, "upstream: {0}, downstream {1}".format(link.upstream, link.downstream)
 
                 if link.upstream in obin_dict:
                     from_bin = obin_dict[link.upstream]
                     olink.departure = from_bin.ns_term()
 
-                if link.downstream in obin_dict:
-                    to_bin = obin_dict[link.downstream]
+                if lin_k.downstream in obin_dict:
+                    tobin = obin_dict[link.downstream]
                     olink.arrival = to_bin.ns_term()
 
                 olink.paths = [schematic.path_names[k] for k in link.participants]

From 8a336d905fbb2d1dd316ed98697dec2d4343cd93 Mon Sep 17 00:00:00 2001
From: 6br <tfdvrt@gmail.com>
Date: Sun, 26 Jul 2020 14:08:47 +0900
Subject: [PATCH 22/25] Add logger info

---
 matrixcomponent/PangenomeSchematic.py | 9 ++++++---
 matrixcomponent/ontology.py           | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index 38c0054..9496984 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -124,15 +124,18 @@ def write_rdf(schematic, ontology_output_path):
             if len(link.participants):
                 link_counter = link_counter + 1
                 olink = olink_dict[link_counter]
-                assert link.upstream in obin_dict == link.downstream in obin_dict, "upstream: {0}, downstream {1}".format(link.upstream, link.downstream)
 
                 if link.upstream in obin_dict:
                     from_bin = obin_dict[link.upstream]
                     olink.departure = from_bin.ns_term()
+                else:
+                    LOGGER.info(f"No upstream {link.upstream}")
 
-                if lin_k.downstream in obin_dict:
-                    tobin = obin_dict[link.downstream]
+                if link.downstream in obin_dict:
+                    to_bin = obin_dict[link.downstream]
                     olink.arrival = to_bin.ns_term()
+                else:
+                    LOGGER.info(f"No downstream {link.downstream}")
 
                 olink.paths = [schematic.path_names[k] for k in link.participants]
                 olink.linkZoomLevel = zoom_level.ns_term()
diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py
index 6d6196b..d9e7301 100644
--- a/matrixcomponent/ontology.py
+++ b/matrixcomponent/ontology.py
@@ -168,7 +168,7 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
 
         if self.linkZoomLevel:
             graph.add((link, vg.linkZoomLevel, URIRef(self.linkZoomLevel)))
-        assert self.arrival and self.departure
+        # assert self.arrival and self.departure
         if self.arrival:
             graph.add((link, vg.arrival, URIRef(self.arrival)))
 

From 61ceb227323812012fa50e0e06294004df64c731 Mon Sep 17 00:00:00 2001
From: 6br <tfdvrt@gmail.com>
Date: Sun, 26 Jul 2020 14:08:47 +0900
Subject: [PATCH 23/25] Register logger

---
 matrixcomponent/PangenomeSchematic.py | 11 ++++++++---
 matrixcomponent/ontology.py           |  2 +-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index 38c0054..08ed5e5 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -22,6 +22,8 @@
 
 from DNASkittleUtils.Contigs import Contig, write_contigs_to_file
 
+LOGGER = logging.getLogger(__name__)
+"""logging.Logger: The logger for this module"""
 
 # move out of the class to be able to use with joblib
 def write_rdf(schematic, ontology_output_path):
@@ -124,15 +126,18 @@ def write_rdf(schematic, ontology_output_path):
             if len(link.participants):
                 link_counter = link_counter + 1
                 olink = olink_dict[link_counter]
-                assert link.upstream in obin_dict == link.downstream in obin_dict, "upstream: {0}, downstream {1}".format(link.upstream, link.downstream)
 
                 if link.upstream in obin_dict:
                     from_bin = obin_dict[link.upstream]
                     olink.departure = from_bin.ns_term()
+                else:
+                    LOGGER.info(f"No upstream {link.upstream}")
 
-                if lin_k.downstream in obin_dict:
-                    tobin = obin_dict[link.downstream]
+                if link.downstream in obin_dict:
+                    to_bin = obin_dict[link.downstream]
                     olink.arrival = to_bin.ns_term()
+                else:
+                    LOGGER.info(f"No downstream {link.downstream}")
 
                 olink.paths = [schematic.path_names[k] for k in link.participants]
                 olink.linkZoomLevel = zoom_level.ns_term()
diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py
index 6d6196b..d9e7301 100644
--- a/matrixcomponent/ontology.py
+++ b/matrixcomponent/ontology.py
@@ -168,7 +168,7 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None:
 
         if self.linkZoomLevel:
             graph.add((link, vg.linkZoomLevel, URIRef(self.linkZoomLevel)))
-        assert self.arrival and self.departure
+        # assert self.arrival and self.departure
         if self.arrival:
             graph.add((link, vg.arrival, URIRef(self.arrival)))
 

From 86eca81d850ba3f84e81f0e72b509824db07a72b Mon Sep 17 00:00:00 2001
From: 6br <tfdvrt@gmail.com>
Date: Sun, 26 Jul 2020 14:54:35 +0900
Subject: [PATCH 24/25] Add bin logger

---
 matrixcomponent/PangenomeSchematic.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index 8e433e0..ccdd9d1 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -106,6 +106,7 @@ def write_rdf(schematic, ontology_output_path):
                     obin.cells.append(ocell)
 
     # links between components and their bins
+    LOGGER.info(f"Bin dictionary {obin_dict}")
 
     olink_dict = {}
     link_counter = 0

From f7047679d8a037a3faa0d621fb59856a605fee80 Mon Sep 17 00:00:00 2001
From: 6br <tfdvrt@gmail.com>
Date: Sun, 26 Jul 2020 16:23:35 +0900
Subject: [PATCH 25/25] Disable parallel on rdf writer

---
 matrixcomponent/PangenomeSchematic.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py
index ccdd9d1..dda5ff1 100644
--- a/matrixcomponent/PangenomeSchematic.py
+++ b/matrixcomponent/PangenomeSchematic.py
@@ -271,11 +271,12 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li
                                      str(ontology_folder.joinpath(self.ttl_filename(i))) )
                                    for i, cut in enumerate(cut_points[:-1]) if self.components[cut:cut_points[i + 1]] )
 
-            if parallel:
-                results = parallel(delayed(write_rdf)(sch, path) for (sch, path) in prepared_schematics)
-            else:
-                for (sch, path) in prepared_schematics:
-                    write_rdf(sch, path)
+            #TODO() Now parallel is disabled because links overlap across partial pangenomic schematic occurs, though they cannot rescue because the communication across threads is not implemented yet.
+
+            #if parallel:
+            #    results = parallel(delayed(write_rdf)(sch, path) for (sch, path) in prepared_schematics)
+            #else:
+            write_rdf(self, str(ontology_folder.joinpath(self.ttl_filename(0))))
 
         return bin2file_mapping