From acb580ffd4481691444740f8ecfa575ae2c722f6 Mon Sep 17 00:00:00 2001 From: Dmytro Trybushnyi Date: Tue, 28 Apr 2020 23:58:21 +0200 Subject: [PATCH 01/25] do the data repack into the ontology containers with further RDF 'turtle' format dump --- data/chrk_ath_12samples_10kb.w100000_S.json | 2 +- matrixcomponent/JSONparser.py | 2 +- matrixcomponent/PangenomeSchematic.py | 90 +++++++- matrixcomponent/matrix.py | 1 + matrixcomponent/ontology.py | 230 ++++++++++++++++++++ segmentation.py | 20 +- 6 files changed, 338 insertions(+), 7 deletions(-) create mode 100644 matrixcomponent/ontology.py diff --git a/data/chrk_ath_12samples_10kb.w100000_S.json b/data/chrk_ath_12samples_10kb.w100000_S.json index 3f89fd3..94b34f5 100644 --- a/data/chrk_ath_12samples_10kb.w100000_S.json +++ b/data/chrk_ath_12samples_10kb.w100000_S.json @@ -1,4 +1,4 @@ -{"odgi_version": 10,"bin_width": 100000,"pangenome_length": 210026186} +{"odgi_version": 12,"bin_width": 100000,"pangenome_length": 210026186} {"bin_id":1} {"bin_id":2} {"bin_id":3} diff --git a/matrixcomponent/JSONparser.py b/matrixcomponent/JSONparser.py index f04fc61..a246da8 100644 --- a/matrixcomponent/JSONparser.py +++ b/matrixcomponent/JSONparser.py @@ -37,7 +37,7 @@ def process_path(line=None): if type(ranges) is not list and len(b) >= 6: ranges = [[b[4], b[5]]] - bin = matrix.Bin(b[0], b[1], b[2], ranges) + bin = matrix.Bin(b[0], b[1], b[2], ranges, 0) # path_id = 0 p.bins.setdefault(bin.bin_id, bin) p.links = np.asarray(path['links'], dtype='int32') diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index 1f3d11b..0b18e09 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -10,9 +10,11 @@ from dataclasses import dataclass -from matrixcomponent import JSON_VERSION +from matrixcomponent import JSON_VERSION, ontology from matrixcomponent.matrix import Component, Bin, LinkColumn +from rdflib import Graph, Namespace, URIRef + from DNASkittleUtils.Contigs import Contig, write_contigs_to_file @dataclass @@ -50,7 +52,7 @@ def update_first_last_bin(self): self.first_bin = 1 # these have not been properly initialized self.last_bin = self.components[-1].last_bin - def split_and_write(self, cells_per_file, folder, fasta : Contig): + def split_and_write(self, cells_per_file, folder, fasta: Contig, ontology_folder): """Splits one Schematic into multiple files with their own unique first and last_bin based on the volume of data desired per file specified by cells_per_file. """ @@ -96,6 +98,87 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig): "first_bin": schematic.first_bin, "last_bin": schematic.last_bin}) + if ontology_folder: + # going from top to bottom + zoom_level = ontology.ZoomLevel() + zoom_level.zoom_factor = schematic.bin_width + zoom_level.ns = URIRef('pg/') + + cell_counter = 0 + obin_dict = {} + for ic, component in enumerate(schematic.components): + ocomp = ontology.Component(ic + 1) + zoom_level.components.append(ocomp) + + # bins + for bins in component.matrix: + for bin in bins: + if bin: + obin = ontology.Bin() + obin.bin_rank = bin.bin_id + obin.path_id = bin.path_id # saved in the populate_component_matrix + obin_dict[bin.bin_id] = obin + ocomp.bins.append(obin) + + cell_counter = cell_counter + 1 + ocell = ontology.Cell() + ocell.id = cell_counter + ocell.inversion_percent = bin.inversion + ocell.position_percent = bin.coverage + + for [begin, end] in bin.nucleotide_ranges: + oregion = ontology.Region() + oregion.begin = begin + oregion.end = end + ocell.cell_region.append(oregion) + + obin.cells.append(ocell) + + # links between components and their bins + for component, ocomp in zip(schematic.components, zoom_level.components): + # links: arrivals + link_counter = 0 + for link in component.arrivals: + link_counter = link_counter + 1 + olink = ontology.Link() + olink.id = link_counter + + if link.upstream in obin_dict: + olink.departure = obin_dict[link.upstream] + + if link.downstream in obin_dict: + olink.arrival = obin_dict[link.downstream] + + olink.paths = (link.participants + 1).tolist() + ocomp.links.append(olink) + + for link in component.departures: + link_counter = link_counter + 1 + olink = ontology.Link() + olink.id = link_counter + + if link.upstream in obin_dict: + olink.departure = obin_dict[link.upstream] + + if link.downstream in obin_dict: + olink.arrival = obin_dict[link.downstream] + + olink.paths = (link.participants + 1).tolist() + ocomp.links.append(olink) + + + g = Graph() + vg = Namespace('http://biohackathon.org/resource/vg#') + faldo = Namespace('http://biohackathon.org/resource/faldo#') + g.bind('vg', vg) + g.bind('faldo', faldo) + + zoom_level.add_to_graph(g, vg, faldo) # here the magic happens + + p = ontology_folder.joinpath(schematic.ttl_filename(i)) + g.serialize(destination=str(p), format='turtle', encoding='utf-8') + + return bin2file_mapping def find_cut_points_in_file_split(self, bins_per_file, column_counts): @@ -134,6 +217,9 @@ def filename(self, nth_file): def fasta_filename(self, nth_file): return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.fa' + def ttl_filename(self, nth_file): + return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.ttl' + def write_index_file(self, folder, bin2file_mapping): file_contents = {'bin_width': self.bin_width, diff --git a/matrixcomponent/matrix.py b/matrixcomponent/matrix.py index 269753d..b57c1f5 100644 --- a/matrixcomponent/matrix.py +++ b/matrixcomponent/matrix.py @@ -12,6 +12,7 @@ class Bin: coverage: float inversion: float nucleotide_ranges: List[List[int]] + path_id: int sequence: str = '' ## Path is all for input files diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py new file mode 100644 index 0000000..2d070cc --- /dev/null +++ b/matrixcomponent/ontology.py @@ -0,0 +1,230 @@ +from typing import List +from rdflib import Namespace, Graph, Literal, URIRef, RDF + + +class Path: + id: str + ns: URIRef + + def __init__(self): + ns = Namespace("") # should be empty! + + def __str__(self): + return "path" + str(self.id) # path1 + + def ns_term(self): + return self.ns + "/" + str(self) + + def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: + path = self.ns + str(self) # str representation + + # add the object itself +# graph.add((path, RDF.type, faldo.Path)) + + # add its properties, recursively if needed + + +class Region: + ns: URIRef + begin: int + end: int + + def __str__(self): + return "region/" + str(self.begin) + "-" + str(self.end) # region/2-10 + + def ns_term(self): + return self.ns + str(self) # path1/... + + def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: + region = self.ns_term() # str representation + + # add the object itself + graph.add( (region, RDF.type, faldo.Region) ) + + # add its properties, recursively if needed + graph.add( (region, faldo.begin, self.ns + "position/" + str(self.begin)) ) + graph.add( (region, faldo.end, self.ns + "position/" + str(self.end)) ) + + +class Cell: + id: int + path_id: int + ns: URIRef + position_percent: float + inversion_percent: float + cell_region: List[Region] # [[1,4], [7,11]] + + def __init__(self): + self.cell_region = [] + + def __str__(self): + return "cell" + str(self.id) # cell1 + + def ns_term(self): + return self.ns + str(self) + + def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: + cell = self.ns_term() + inner_ns = URIRef("path" + str(self.path_id) + "/") + + # add the object itself + graph.add((cell, RDF.type, vg.Cell)) + + # add its properties, recursively if needed + graph.add((cell, vg.positionPercent, Literal(self.position_percent))) + graph.add((cell, vg.inversionPercent, Literal(self.inversion_percent))) + for region in self.cell_region: + region.ns = inner_ns + graph.add((cell, vg.cellRegion, region.ns_term())) # can have multiple bins + region.add_to_graph(graph, vg, faldo) + + +class Bin: + ns: URIRef + bin_rank: int + path_id: int + bin_edge: object # pointer to the next Bin + cells: List[Region] + + def __init__(self): + self.bin_edge = None + self.cells = [] + + def __str__(self): + return "bin" + str(self.bin_rank) # bin1 + + def ns_term(self): + return self.ns + str(self) + + def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: + bin = self.ns_term() + inner_ns = bin + "/" + + # add the object itself + graph.add((bin, RDF.type, vg.Bin)) + + # add its properties, recursively if needed + graph.add((bin, vg.binRank, Literal(self.bin_rank))) + for cell in self.cells: + cell.ns = inner_ns + cell.path_id = self.path_id + graph.add((bin, vg.cells, cell.ns_term())) # can have multiple bins + cell.add_to_graph(graph, vg, faldo) + + # add the reference to another object if needed + if self.bin_edge is not None: + graph.add((inner_ns, vg.binEdge, Literal(self.bin_edge))) # __str__ will be called + + +class Link: + id: int + ns: URIRef + arrival: Bin + departure: Bin + paths: List[Path] + + def __init__(self): + self.paths = [] + self.arrival = None + self.departure = None + + def __str__(self): + return "link" + str(self.id) # bin1 + + + def ns_term(self): + return self.ns + str(self) + + + def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: + link = self.ns_term() + inner_ns = link + "/" + + # add the object itself + graph.add((link, RDF.type, vg.Link)) + + # add its properties, recursively if needed + if self.arrival: + self.arrival.ns = self.ns + graph.add((link, vg.arrival, self.arrival.ns_term())) + + if self.departure: + self.departure.ns = self.ns + graph.add((link, vg.departure, self.departure.ns_term())) + + for path in self.paths: + graph.add((link, vg.linkPaths, URIRef("path" + str(path)))) # can have multiple bins + + +class Component: + id: int + ns: URIRef + component_edge: object # Component + component_rank: int + bins: List[Bin] + links: List[Link] + + def __init__(self, id): + self.id = id + self.component_edge = None + self.bins = [] + self.links = [] + + def __str__(self): + return "component" + str(self.id) # component1 + + def ns_term(self): + return self.ns + self.__str__() + + def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: + component = self.ns_term() + inner_ns = component + "/" + + # add the object itself + graph.add((component, RDF.type, vg.Component)) + + # add its properties, recursively if needed + graph.add((component, vg.componentRank, Literal(self.id))) + for bin in self.bins: + bin.ns = inner_ns + graph.add((component, vg.bins, bin.ns_term())) # can have multiple bins + bin.add_to_graph(graph, vg, faldo) # add the inner content of each + + for link in self.links: + link.ns = inner_ns + link.add_to_graph(graph, vg, faldo) # add the inner content of each + + + # add the reference to another object if needed + if self.component_edge is not None: + graph.add((component, vg.componentEdge, Literal(self.component_edge))) # __str__ will be called + + +class ZoomLevel: + id: str + ns: URIRef + zoom_factor: int + components: List[Component] + + def __init__(self): + self.components = [] + + def __str__(self): + return "zoom" + str(self.zoom_factor) #zoom1000 + + def ns_term(self): + return self.ns + self.__str__() + + def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: + zoomfactor = self.ns_term() + inner_ns = zoomfactor + "/" + + # add the object itself + graph.add((zoomfactor, RDF.type, vg.ZoomLevel)) + + # add its properties, recursively if needed + graph.add((zoomfactor, vg.zoomFactor, Literal(self.zoom_factor))) + for i,comp in enumerate(self.components): + comp.ns = inner_ns + graph.add((zoomfactor, vg.components, comp.ns_term())) + comp.add_to_graph(graph, vg, faldo) diff --git a/segmentation.py b/segmentation.py index c779ddb..1ac535a 100644 --- a/segmentation.py +++ b/segmentation.py @@ -67,6 +67,7 @@ def populate_component_matrix(paths: List[Path], schematic: PangenomeSchematic): # this case enforces first_bin == last_bin --- comp.matrix[p] has a single element for comp, fr in zip(comp_filtered, from_filtered): bin = values[fr] + bin.path_id = p # save for later comp.matrix[p] = [bin] comp.occupants[p] = bin.coverage > 0.1 @@ -287,14 +288,16 @@ def _split_lines(self, text, width): return argparse.HelpFormatter._split_lines(self, text, width) -def write_files(folder, odgi_fasta: Path, schematic: PangenomeSchematic): +def write_files(folder, ontology_folder, odgi_fasta: Path, schematic: PangenomeSchematic): os.makedirs(folder, exist_ok=True) # make directory for all files + if ontology_folder: + os.makedirs(ontology_folder, exist_ok=True) fasta = None if odgi_fasta: fasta = read_contigs(odgi_fasta)[0] - bin2file_mapping = schematic.split_and_write(args.cells_per_file, folder, fasta) + bin2file_mapping = schematic.split_and_write(args.cells_per_file, folder, fasta, ontology_folder) schematic.write_index_file(folder, bin2file_mapping) @@ -345,6 +348,12 @@ def get_arguments(): type=int, help='Tip: do not set this one to more than available CPU cores)') + parser.add_argument('-t', '--do-ttl', + dest='do_ttl', + default=False, + type=bool, + help='do the ontology turtle output or not)') + args = parser.parse_args() # file path logic for single or list of files with wildcard * @@ -392,7 +401,12 @@ def main(): # this one spits out json and optionally other output files (fasta, ttl) path_name = str(bin_width) folder_path = osPath(args.output_folder).joinpath(path_name) # full path - write_files(folder_path, args.fasta, schematic) + + ontology_folder_path = None + if args.do_ttl: + ontology_folder_path = osPath(args.output_folder).joinpath(path_name + '-turtle') + + write_files(folder_path, ontology_folder_path, args.fasta, schematic) LOGGER.info("Finished processing the file " + json_file) From 44caccc57629ef148db3aaebb575efabd39ed5d6 Mon Sep 17 00:00:00 2001 From: Dmytro Trybushnyi Date: Thu, 30 Apr 2020 23:46:54 +0200 Subject: [PATCH 02/25] further ontology fixes: real path names; better forward* and reverse* edges definition for Component and Bin containers; Links are a part of ZoomLevel component --- matrixcomponent/PangenomeSchematic.py | 111 +++++++++++++++++++++++++- matrixcomponent/matrix.py | 4 + matrixcomponent/ontology.py | 100 +++++++++++------------ 3 files changed, 160 insertions(+), 55 deletions(-) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index daa78ed..6df6516 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -10,7 +10,9 @@ from dataclasses import dataclass -from matrixcomponent import JSON_VERSION +from rdflib import URIRef, Graph, Namespace + +from matrixcomponent import JSON_VERSION, ontology from matrixcomponent.matrix import Component, Bin, LinkColumn from DNASkittleUtils.Contigs import Contig, write_contigs_to_file @@ -56,7 +58,7 @@ def update_first_last_bin(self): self.first_bin = 1 # these have not been properly initialized self.last_bin = self.components[-1].last_bin - def split_and_write(self, cells_per_file, folder, fasta : Contig): + def split_and_write(self, cells_per_file, folder, fasta : Contig, ontology_folder): """Splits one Schematic into multiple files with their own unique first and last_bin based on the volume of data desired per file specified by cells_per_file. """ @@ -100,6 +102,108 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig): c = folder.joinpath(schematic.fasta_filename(i)) write_contigs_to_file(c, chunk) + if ontology_folder: + zoom_level = ontology.ZoomLevel() + zoom_level.zoom_factor = schematic.bin_width + zoom_level.ns = URIRef('pg/') + + prev_comp_id = -1 + cell_counter = 0 + ocomp_dict = {} + obin_dict = {} + for ic, component in enumerate(schematic.components): + ocomp = ontology.Component(ic+1) + ocomp.ns = zoom_level.ns_term() + '/' + zoom_level.components.append(ocomp) + + # save the sequence 1-2-3-..-n as a bi-directed list + if prev_comp_id in ocomp_dict: + prev_comp = ocomp_dict[prev_comp_id] + ocomp.reverse_component_edge = prev_comp.ns_term() + prev_comp.forward_component_edge = ocomp.ns_term() + + ocomp_dict[ic] = ocomp + prev_comp_id = ic + + # bins + for bins in component.matrix: + prev_bin_id = -1 + for bin in bins: + if bin: + cur_bin_id = bin.bin_id + obin = ontology.Bin() + obin.ns = ocomp.ns_term() + '/' + obin.bin_rank = cur_bin_id + obin_dict[cur_bin_id] = obin + + # save the sequence 1-2-3-..-n as a bi-directed list + if prev_bin_id in obin_dict: + prev_bin = obin_dict[prev_bin_id] + prev_bin.forward_bin_edge = obin.ns_term() + obin.reverse_bin_edge = prev_bin.ns_term() + + prev_bin_id = cur_bin_id + ocomp.bins.append(obin) + + cell_counter = cell_counter + 1 + ocell = ontology.Cell() + ocell.id = cell_counter + ocell.path_id = self.path_names[bin.path_id] # saved in the populate_component_matrix + ocell.inversion_percent = bin.inversion + ocell.position_percent = bin.coverage + + # todo: are begin,end the real bin_ids or the compressed ones? a sparse list sense + for [begin, end] in bin.nucleotide_ranges: + oregion = ontology.Region() + oregion.begin = begin + oregion.end = end + ocell.cell_region.append(oregion) + + obin.cells.append(ocell) + + # links between components and their bins + + all_links = [] + for component in schematic.components: + # search in both arrivals and departures of the component <-> component links + all_links.extend(component.arrivals + component.departures) + + link_counter = 0 + for link in set(all_links): # no duplications; we use Link.__hash__() here - not nice .. + if len(link.participants): + link_counter = link_counter + 1 + olink = ontology.Link() + olink.id = link_counter + + from_bin = None + to_bin = None + if link.upstream in obin_dict: + from_bin = obin_dict[link.upstream] + from_bin.forward_bin_edge = link.downstream + + if link.downstream in obin_dict: + to_bin = obin_dict[link.downstream] + olink.arrival = to_bin.ns_term() + + if from_bin and to_bin: + from_bin.forward_bin_edge = to_bin.ns_term() + to_bin.reverse_bin_edge = from_bin.ns_term() + + olink.paths = [self.path_names[k] for k in link.participants] + zoom_level.links.append(olink) + + g = Graph() + vg = Namespace('http://biohackathon.org/resource/vg#') + faldo = Namespace('http://biohackathon.org/resource/faldo#') + g.bind('vg', vg) + g.bind('faldo', faldo) + + zoom_level.add_to_graph(g, vg, faldo) # here the magic happens + + p = ontology_folder.joinpath(schematic.ttl_filename(i)) + g.serialize(destination=str(p), format='turtle', encoding='utf-8') + + return bin2file_mapping def find_cut_points_in_file_split(self, columns_per_file, column_counts): @@ -129,6 +233,9 @@ def filename(self, nth_file): def fasta_filename(self, nth_file): return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.fa' + def ttl_filename(self, nth_file): + return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.ttl' + def write_index_file(self, folder, bin2file_mapping): file_contents = {'bin_width': self.bin_width, diff --git a/matrixcomponent/matrix.py b/matrixcomponent/matrix.py index 2b15624..7a14874 100644 --- a/matrixcomponent/matrix.py +++ b/matrixcomponent/matrix.py @@ -36,6 +36,10 @@ class LinkColumn: num_paths: int participants: 'numpy.array' # ids of participated path_names + # todo: not more than 2^32 bins are supported - refactor the ontology code processing the Link items + def __hash__(self): + return (self.upstream << 32) + self.downstream + class Component: """Block of co-linear variation within a Graph Matrix diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py index 2d070cc..3a3299b 100644 --- a/matrixcomponent/ontology.py +++ b/matrixcomponent/ontology.py @@ -2,28 +2,6 @@ from rdflib import Namespace, Graph, Literal, URIRef, RDF -class Path: - id: str - ns: URIRef - - def __init__(self): - ns = Namespace("") # should be empty! - - def __str__(self): - return "path" + str(self.id) # path1 - - def ns_term(self): - return self.ns + "/" + str(self) - - def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: - path = self.ns + str(self) # str representation - - # add the object itself -# graph.add((path, RDF.type, faldo.Path)) - - # add its properties, recursively if needed - - class Region: ns: URIRef begin: int @@ -48,7 +26,7 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: class Cell: id: int - path_id: int + path_id: str ns: URIRef position_percent: float inversion_percent: float @@ -65,7 +43,7 @@ def ns_term(self): def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: cell = self.ns_term() - inner_ns = URIRef("path" + str(self.path_id) + "/") + inner_ns = URIRef(self.path_id + "/") # add the object itself graph.add((cell, RDF.type, vg.Cell)) @@ -82,13 +60,14 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: class Bin: ns: URIRef bin_rank: int - path_id: int - bin_edge: object # pointer to the next Bin + forward_bin_edge: str + reverse_bin_edge: str cells: List[Region] def __init__(self): - self.bin_edge = None self.cells = [] + self.forward_bin_edge = '' + self.reverse_bin_edge = '' def __str__(self): return "bin" + str(self.bin_rank) # bin1 @@ -107,26 +86,32 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: graph.add((bin, vg.binRank, Literal(self.bin_rank))) for cell in self.cells: cell.ns = inner_ns - cell.path_id = self.path_id graph.add((bin, vg.cells, cell.ns_term())) # can have multiple bins cell.add_to_graph(graph, vg, faldo) # add the reference to another object if needed - if self.bin_edge is not None: - graph.add((inner_ns, vg.binEdge, Literal(self.bin_edge))) # __str__ will be called + if self.forward_bin_edge: + graph.add((bin, vg.forwardBinEdge, URIRef(self.forward_bin_edge))) + + if self.reverse_bin_edge: + graph.add((bin, vg.reverseBinEdge, URIRef(self.reverse_bin_edge))) class Link: id: int ns: URIRef - arrival: Bin - departure: Bin - paths: List[Path] + arrival: str + departure: str + paths: List[str] + forward_link_edge: int + reverse_link_edge: int def __init__(self): self.paths = [] - self.arrival = None - self.departure = None + self.forward_link_edge = -1 + self.reverse_link_edge = -1 + self.arrival = '' + self.departure = '' def __str__(self): return "link" + str(self.id) # bin1 @@ -145,30 +130,35 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: # add its properties, recursively if needed if self.arrival: - self.arrival.ns = self.ns - graph.add((link, vg.arrival, self.arrival.ns_term())) + graph.add((link, vg.arrival, URIRef(self.arrival))) if self.departure: - self.departure.ns = self.ns - graph.add((link, vg.departure, self.departure.ns_term())) + graph.add((link, vg.departure, URIRef(self.departure))) for path in self.paths: - graph.add((link, vg.linkPaths, URIRef("path" + str(path)))) # can have multiple bins + graph.add((link, vg.linkPaths, URIRef(path))) # can have multiple bins + + # add the reference to another object if needed + if self.forward_link_edge > -1: + graph.add((link, vg.forwardLinkEdge, self.ns + "link" + str(self.forward_link_edge))) + + if self.reverse_link_edge > -1: + graph.add((link, vg.reverseLinkEdge, self.ns + "link" + str(self.reverse_link_edge))) class Component: id: int ns: URIRef - component_edge: object # Component + forward_component_edge: str # id of the next Component + reverse_component_edge: str # id of the previous Component component_rank: int bins: List[Bin] - links: List[Link] def __init__(self, id): self.id = id - self.component_edge = None self.bins = [] - self.links = [] + self.forward_component_edge = '' + self.reverse_component_edge = '' def __str__(self): return "component" + str(self.id) # component1 @@ -187,17 +177,15 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: graph.add((component, vg.componentRank, Literal(self.id))) for bin in self.bins: bin.ns = inner_ns - graph.add((component, vg.bins, bin.ns_term())) # can have multiple bins - bin.add_to_graph(graph, vg, faldo) # add the inner content of each - - for link in self.links: - link.ns = inner_ns - link.add_to_graph(graph, vg, faldo) # add the inner content of each - + graph.add((component, vg.bins, bin.ns_term())) # can have multiple bins + bin.add_to_graph(graph, vg, faldo) # add the inner content of each # add the reference to another object if needed - if self.component_edge is not None: - graph.add((component, vg.componentEdge, Literal(self.component_edge))) # __str__ will be called + if self.forward_component_edge: + graph.add((component, vg.forwardComponentEdge, URIRef(self.forward_component_edge))) + + if self.reverse_component_edge: + graph.add((component, vg.reverseComponentEdge, URIRef(self.reverse_component_edge))) class ZoomLevel: @@ -205,9 +193,11 @@ class ZoomLevel: ns: URIRef zoom_factor: int components: List[Component] + links: List[Link] def __init__(self): self.components = [] + self.links = [] def __str__(self): return "zoom" + str(self.zoom_factor) #zoom1000 @@ -228,3 +218,7 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: comp.ns = inner_ns graph.add((zoomfactor, vg.components, comp.ns_term())) comp.add_to_graph(graph, vg, faldo) + + for link in self.links: + link.ns = inner_ns + link.add_to_graph(graph, vg, faldo) From 29550fdd10617c76e8c5e25e5cb6feb74067e916 Mon Sep 17 00:00:00 2001 From: Dmytro Trybushnyi Date: Mon, 11 May 2020 16:01:39 +0200 Subject: [PATCH 03/25] - do not forget to store the path_id - use faldo.ExactPosition when appropriate --- matrixcomponent/ontology.py | 22 +++++++++++++++------- segmentation.py | 1 + 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py index 3a3299b..00bf6f9 100644 --- a/matrixcomponent/ontology.py +++ b/matrixcomponent/ontology.py @@ -8,7 +8,10 @@ class Region: end: int def __str__(self): - return "region/" + str(self.begin) + "-" + str(self.end) # region/2-10 + suffix = str(self.begin) # region/2 + if self.begin != self.end: + suffix = suffix + "-" + str(self.end) # region/2-10 + return "region/" + suffix def ns_term(self): return self.ns + str(self) # path1/... @@ -16,12 +19,17 @@ def ns_term(self): def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: region = self.ns_term() # str representation - # add the object itself - graph.add( (region, RDF.type, faldo.Region) ) + if self.begin == self.end: + # add the object itself + graph.add((region, RDF.type, faldo.ExactPosition)) + graph.add((region, faldo.position, Literal(self.begin))) + else: + # add the object itself + graph.add( (region, RDF.type, faldo.Region) ) - # add its properties, recursively if needed - graph.add( (region, faldo.begin, self.ns + "position/" + str(self.begin)) ) - graph.add( (region, faldo.end, self.ns + "position/" + str(self.end)) ) + # add its properties, recursively if needed + graph.add( (region, faldo.begin, self.ns + "position/" + str(self.begin)) ) + graph.add( (region, faldo.end, self.ns + "position/" + str(self.end)) ) class Cell: @@ -36,7 +44,7 @@ def __init__(self): self.cell_region = [] def __str__(self): - return "cell" + str(self.id) # cell1 + return "cell" + self.path_id # cell1 def ns_term(self): return self.ns + str(self) diff --git a/segmentation.py b/segmentation.py index 3f5baf6..7fc3046 100644 --- a/segmentation.py +++ b/segmentation.py @@ -86,6 +86,7 @@ def populate_component_matrix(paths: List[Path], schematic: PangenomeSchematic): sliced = values[fr:to] for bin in sliced: padded[bin.bin_id - fb] = bin # do not create objects, simply link them + bin.path_id = p # save for later comp.matrix[p] = padded comp.occupants[p] = any([bin.coverage > 0.1 for bin in sliced]) From b8d0e162ab424ea953d5a4836886898ec802c1a2 Mon Sep 17 00:00:00 2001 From: Dmytro Trybushnyi Date: Wed, 13 May 2020 00:07:53 +0200 Subject: [PATCH 04/25] - explicit faldo:ExactPosition containers - every Link has linkRank numbered after the pair sort (component.id, [component.departures]) --- matrixcomponent/PangenomeSchematic.py | 57 ++++++++++++++++----------- matrixcomponent/ontology.py | 48 +++++++++++++++------- 2 files changed, 68 insertions(+), 37 deletions(-) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index 6df6516..acbd38f 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -111,6 +111,7 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, ontology_folde cell_counter = 0 ocomp_dict = {} obin_dict = {} + oposition_dict = {} for ic, component in enumerate(schematic.components): ocomp = ontology.Component(ic+1) ocomp.ns = zoom_level.ns_term() + '/' @@ -153,44 +154,54 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, ontology_folde ocell.position_percent = bin.coverage # todo: are begin,end the real bin_ids or the compressed ones? a sparse list sense + cell_ns = URIRef(ocell.path_id + "/") for [begin, end] in bin.nucleotide_ranges: oregion = ontology.Region() oregion.begin = begin oregion.end = end ocell.cell_region.append(oregion) + oposition_begin = ontology.Position(begin, cell_ns) + oposition_end = ontology.Position(end, cell_ns) + oposition_dict[oposition_begin.ns_term()] = oposition_begin + oposition_dict[oposition_end.ns_term()] = oposition_end + obin.cells.append(ocell) # links between components and their bins - all_links = [] + olink_dict = {} + link_counter = 0 for component in schematic.components: - # search in both arrivals and departures of the component <-> component links - all_links.extend(component.arrivals + component.departures) + # search in all arrivals component <-> component links; departures are iterated automatically + # every link from departures will be in some other arrival + for link in component.departures: + if len(link.participants): + link_counter = link_counter + 1 + olink = ontology.Link() + olink.id = link_counter + olink_dict[link_counter] = olink - link_counter = 0 - for link in set(all_links): # no duplications; we use Link.__hash__() here - not nice .. - if len(link.participants): - link_counter = link_counter + 1 - olink = ontology.Link() - olink.id = link_counter - from_bin = None - to_bin = None - if link.upstream in obin_dict: - from_bin = obin_dict[link.upstream] - from_bin.forward_bin_edge = link.downstream + link_counter = 0 + for component in schematic.components: + # search in all arrivals component <-> component links; departures are iterated automatically + # every link from departures will be in some other arrival + for link in component.departures: + if len(link.participants): + link_counter = link_counter + 1 + olink = olink_dict[link_counter] - if link.downstream in obin_dict: - to_bin = obin_dict[link.downstream] - olink.arrival = to_bin.ns_term() + if link.upstream in obin_dict: + from_bin = obin_dict[link.upstream] + olink.departure = from_bin.ns_term() - if from_bin and to_bin: - from_bin.forward_bin_edge = to_bin.ns_term() - to_bin.reverse_bin_edge = from_bin.ns_term() + if link.downstream in obin_dict: + to_bin = obin_dict[link.downstream] + olink.arrival = to_bin.ns_term() - olink.paths = [self.path_names[k] for k in link.participants] - zoom_level.links.append(olink) + olink.paths = [self.path_names[k] for k in link.participants] + zoom_level.links.append(olink) g = Graph() vg = Namespace('http://biohackathon.org/resource/vg#') @@ -199,6 +210,8 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, ontology_folde g.bind('faldo', faldo) zoom_level.add_to_graph(g, vg, faldo) # here the magic happens + for oposition in oposition_dict.values(): + oposition.add_to_graph(g, vg, faldo) p = ontology_folder.joinpath(schematic.ttl_filename(i)) g.serialize(destination=str(p), format='turtle', encoding='utf-8') diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py index 00bf6f9..402d41e 100644 --- a/matrixcomponent/ontology.py +++ b/matrixcomponent/ontology.py @@ -2,16 +2,37 @@ from rdflib import Namespace, Graph, Literal, URIRef, RDF +class Position: + id: int + ns: URIRef + + def __init__(self, id, ns): + self.id = id + self.ns = ns + + def __str__(self): + return str(self.id) + + def ns_term(self): + return self.ns + str(self) # path1/2 + + def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: + position = self.ns_term() # str representation + + # add the object itself + graph.add((position, RDF.type, faldo.ExactPosition)) + + # add its properties, recursively if needed + graph.add((position, faldo.position, Literal(self.id))) + + class Region: ns: URIRef begin: int end: int def __str__(self): - suffix = str(self.begin) # region/2 - if self.begin != self.end: - suffix = suffix + "-" + str(self.end) # region/2-10 - return "region/" + suffix + return str(self.begin) + "-" + str(self.end) # 2-10 def ns_term(self): return self.ns + str(self) # path1/... @@ -19,17 +40,12 @@ def ns_term(self): def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: region = self.ns_term() # str representation - if self.begin == self.end: - # add the object itself - graph.add((region, RDF.type, faldo.ExactPosition)) - graph.add((region, faldo.position, Literal(self.begin))) - else: - # add the object itself - graph.add( (region, RDF.type, faldo.Region) ) + # add the object itself + graph.add((region, RDF.type, faldo.Region)) - # add its properties, recursively if needed - graph.add( (region, faldo.begin, self.ns + "position/" + str(self.begin)) ) - graph.add( (region, faldo.end, self.ns + "position/" + str(self.end)) ) + # add its properties, recursively if needed + graph.add((region, faldo.begin, self.ns + str(self.begin))) + graph.add((region, faldo.end, self.ns + str(self.end))) class Cell: @@ -61,7 +77,7 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: graph.add((cell, vg.inversionPercent, Literal(self.inversion_percent))) for region in self.cell_region: region.ns = inner_ns - graph.add((cell, vg.cellRegion, region.ns_term())) # can have multiple bins + graph.add((cell, vg.cellRegions, region.ns_term())) # can have multiple regions region.add_to_graph(graph, vg, faldo) @@ -137,6 +153,8 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: graph.add((link, RDF.type, vg.Link)) # add its properties, recursively if needed + graph.add((link, vg.linkRank, Literal(self.id))) + if self.arrival: graph.add((link, vg.arrival, URIRef(self.arrival))) From 26e69015b3b20397801e601a0c898f99cdf4f74a Mon Sep 17 00:00:00 2001 From: Dmytro Trybushnyi Date: Tue, 16 Jun 2020 11:48:53 +0200 Subject: [PATCH 05/25] write faldo:ForwardStrandPosition, faldo:position and faldo:reference in each Position. Add Path write out --- matrixcomponent/PangenomeSchematic.py | 22 ++++++++++++------ matrixcomponent/ontology.py | 33 ++++++++++++++++++++++++--- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index f401d7f..db9777e 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -166,7 +166,7 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li # bins for bins in component.matrix: prev_bin_id = -1 - for bin in bins: + for bin in bins[1][1]: # follow the compressed format if bin: cur_bin_id = bin.bin_id obin = ontology.Bin() @@ -192,14 +192,19 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li # todo: are begin,end the real bin_ids or the compressed ones? a sparse list sense cell_ns = URIRef(ocell.path_id + "/") - for [begin, end] in bin.nucleotide_ranges: + for i in range(0, len(bin.nucleotide_ranges), 2): + begin, end = bin.nucleotide_ranges[i], bin.nucleotide_ranges[i+1] + real_begin = begin if begin else end + real_end = end if end else begin + oregion = ontology.Region() - oregion.begin = begin - oregion.end = end + oregion.begin = real_begin + oregion.end = real_end ocell.cell_region.append(oregion) - oposition_begin = ontology.Position(begin, cell_ns) - oposition_end = ontology.Position(end, cell_ns) + path = self.path_names[bin.path_id] + oposition_begin = ontology.Position(real_begin, begin <= end, path, cell_ns) + oposition_end = ontology.Position(real_end, begin <= end, path, cell_ns) oposition_dict[oposition_begin.ns_term()] = oposition_begin oposition_dict[oposition_end.ns_term()] = oposition_end @@ -246,9 +251,12 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li g.bind('vg', vg) g.bind('faldo', faldo) - zoom_level.add_to_graph(g, vg, faldo) # here the magic happens + # here the magic happens + zoom_level.add_to_graph(g, vg, faldo) for oposition in oposition_dict.values(): oposition.add_to_graph(g, vg, faldo) + for path in self.path_names: + ontology.Path(path).add_to_graph(g, vg, faldo) p = ontology_folder.joinpath(schematic.ttl_filename(i)) g.serialize(destination=str(p), format='turtle', encoding='utf-8') diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py index 402d41e..487a7ce 100644 --- a/matrixcomponent/ontology.py +++ b/matrixcomponent/ontology.py @@ -2,12 +2,32 @@ from rdflib import Namespace, Graph, Literal, URIRef, RDF +class Path: + path: str + + def __init__(self, path): + self.path = path + + def ns_term(self): + return self.path # path1 + + def add_to_graph(self, graph: Graph, vg: Namespace, faldo: Namespace) -> None: + path = URIRef(self.ns_term()) # str representation + + # add the object itself + graph.add((path, RDF.type, vg.Path)) + + class Position: id: int + is_forward: bool + path: str ns: URIRef - def __init__(self, id, ns): + def __init__(self, id, is_forward, path, ns): self.id = id + self.is_forward = is_forward + self.path = path self.ns = ns def __str__(self): @@ -21,9 +41,14 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: # add the object itself graph.add((position, RDF.type, faldo.ExactPosition)) + if (self.is_forward): + graph.add((position, RDF.type, faldo.ForwardStrandPosition)) + else: + graph.add((position, RDF.type, faldo.BackwardStrandPosition)) # add its properties, recursively if needed graph.add((position, faldo.position, Literal(self.id))) + graph.add((position, faldo.reference, URIRef(self.path))) class Region: @@ -44,8 +69,10 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: graph.add((region, RDF.type, faldo.Region)) # add its properties, recursively if needed - graph.add((region, faldo.begin, self.ns + str(self.begin))) - graph.add((region, faldo.end, self.ns + str(self.end))) + real_begin = self.begin if self.begin else self.end + real_end = self.end if self.end else self.begin + graph.add((region, faldo.begin, self.ns + str(real_begin))) + graph.add((region, faldo.end, self.ns + str(real_end))) class Cell: From db95d1d4f03d1c65710afd552db5a3f6fa3f63d8 Mon Sep 17 00:00:00 2001 From: Dmytro Trybushnyi Date: Tue, 16 Jun 2020 14:57:34 +0200 Subject: [PATCH 06/25] create ontology folder when needed --- segmentation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/segmentation.py b/segmentation.py index e144a3c..cb36fd4 100644 --- a/segmentation.py +++ b/segmentation.py @@ -284,6 +284,8 @@ def _split_lines(self, text, width): def write_files(folder, ontology_folder, odgi_fasta: Path, schematic: PangenomeSchematic, no_adjacent_links): os.makedirs(folder, exist_ok=True) # make directory for all files + if ontology_folder: + os.makedirs(ontology_folder, exist_ok=True) fasta = None if odgi_fasta: From fb1a3ec36e881bca2ac8c46a0ba49cfbd8689ac7 Mon Sep 17 00:00:00 2001 From: Dmytro Trybushnyi Date: Tue, 16 Jun 2020 16:06:14 +0200 Subject: [PATCH 07/25] proper name and strand directions --- matrixcomponent/PangenomeSchematic.py | 4 ++-- matrixcomponent/ontology.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index db9777e..9227946 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -203,8 +203,8 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li ocell.cell_region.append(oregion) path = self.path_names[bin.path_id] - oposition_begin = ontology.Position(real_begin, begin <= end, path, cell_ns) - oposition_end = ontology.Position(real_end, begin <= end, path, cell_ns) + oposition_begin = ontology.Position(real_begin, begin <= end | end == 0, path, cell_ns) + oposition_end = ontology.Position(real_end, begin <= end | end == 0, path, cell_ns) oposition_dict[oposition_begin.ns_term()] = oposition_begin oposition_dict[oposition_end.ns_term()] = oposition_end diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py index 487a7ce..7b1a68b 100644 --- a/matrixcomponent/ontology.py +++ b/matrixcomponent/ontology.py @@ -44,7 +44,7 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: if (self.is_forward): graph.add((position, RDF.type, faldo.ForwardStrandPosition)) else: - graph.add((position, RDF.type, faldo.BackwardStrandPosition)) + graph.add((position, RDF.type, faldo.ReverseStrandPosition)) # add its properties, recursively if needed graph.add((position, faldo.position, Literal(self.id))) From 8e233eca77f358528ceef2ece19c6182580c15b0 Mon Sep 17 00:00:00 2001 From: Dmytro Trybushnyi Date: Tue, 16 Jun 2020 16:22:16 +0200 Subject: [PATCH 08/25] directions --- matrixcomponent/PangenomeSchematic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index 9227946..d616139 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -203,8 +203,8 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li ocell.cell_region.append(oregion) path = self.path_names[bin.path_id] - oposition_begin = ontology.Position(real_begin, begin <= end | end == 0, path, cell_ns) - oposition_end = ontology.Position(real_end, begin <= end | end == 0, path, cell_ns) + oposition_begin = ontology.Position(real_begin, begin >= end, path, cell_ns) + oposition_end = ontology.Position(real_end, begin >= end, path, cell_ns) oposition_dict[oposition_begin.ns_term()] = oposition_begin oposition_dict[oposition_end.ns_term()] = oposition_end From d0bff9ade29f84e33875e38988dc5c980aad9d2e Mon Sep 17 00:00:00 2001 From: subwaystation Date: Wed, 17 Jun 2020 21:21:58 +0200 Subject: [PATCH 09/25] fix bin_edge in .ttl output; but might be slow --- matrixcomponent/PangenomeSchematic.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index d616139..7229470 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -149,6 +149,10 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li ocomp_dict = {} obin_dict = {} oposition_dict = {} + + min_bin_id = schematic.first_bin + max_bin_id = schematic.last_bin + for ic, component in enumerate(schematic.components): ocomp = ontology.Component(ic+1) ocomp.ns = zoom_level.ns_term() + '/' @@ -165,7 +169,6 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li # bins for bins in component.matrix: - prev_bin_id = -1 for bin in bins[1][1]: # follow the compressed format if bin: cur_bin_id = bin.bin_id @@ -174,13 +177,17 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li obin.bin_rank = cur_bin_id obin_dict[cur_bin_id] = obin - # save the sequence 1-2-3-..-n as a bi-directed list - if prev_bin_id in obin_dict: - prev_bin = obin_dict[prev_bin_id] - prev_bin.forward_bin_edge = obin.ns_term() + if (cur_bin_id > min_bin_id): + prev_bin = ontology.Bin() + prev_bin.ns = ocomp.ns_term() + '/' + prev_bin.bin_rank = cur_bin_id - 1 obin.reverse_bin_edge = prev_bin.ns_term() + if (cur_bin_id < max_bin_id): + next_bin = ontology.Bin() + next_bin.ns = ocomp.ns_term() + '/' + next_bin.bin_rank = cur_bin_id + 1 + obin.forward_bin_edge = next_bin.ns_term() - prev_bin_id = cur_bin_id ocomp.bins.append(obin) cell_counter = cell_counter + 1 From 3741e10314486f1eb42ad37a3383220af3ec0968 Mon Sep 17 00:00:00 2001 From: Dmytro Trybushnyi Date: Thu, 18 Jun 2020 10:23:33 +0200 Subject: [PATCH 10/25] recycle temp objects, do not create new ones --- matrixcomponent/PangenomeSchematic.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index 7229470..f7199ab 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -167,26 +167,26 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li ocomp_dict[ic] = ocomp prev_comp_id = ic + obin_ns = ocomp.ns_term() + '/' + obin_tmp = ontology.Bin() + obin_tmp.ns = obin_ns + # bins for bins in component.matrix: for bin in bins[1][1]: # follow the compressed format if bin: cur_bin_id = bin.bin_id obin = ontology.Bin() - obin.ns = ocomp.ns_term() + '/' + obin.ns = obin_ns obin.bin_rank = cur_bin_id obin_dict[cur_bin_id] = obin if (cur_bin_id > min_bin_id): - prev_bin = ontology.Bin() - prev_bin.ns = ocomp.ns_term() + '/' - prev_bin.bin_rank = cur_bin_id - 1 - obin.reverse_bin_edge = prev_bin.ns_term() + obin_tmp.bin_rank = cur_bin_id - 1 + obin.reverse_bin_edge = obin_tmp.ns_term() # string value if (cur_bin_id < max_bin_id): - next_bin = ontology.Bin() - next_bin.ns = ocomp.ns_term() + '/' - next_bin.bin_rank = cur_bin_id + 1 - obin.forward_bin_edge = next_bin.ns_term() + obin_tmp.bin_rank = cur_bin_id + 1 + obin.forward_bin_edge = obin_tmp.ns_term() # string value ocomp.bins.append(obin) From d50210bbceb105ed230db4dc8dcf356ede838c2d Mon Sep 17 00:00:00 2001 From: Dmytro Trybushnyi Date: Thu, 18 Jun 2020 17:58:12 +0200 Subject: [PATCH 11/25] positionPercent and inversionPercent are printed as doubles --- matrixcomponent/ontology.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py index 7b1a68b..0f7833d 100644 --- a/matrixcomponent/ontology.py +++ b/matrixcomponent/ontology.py @@ -1,5 +1,5 @@ from typing import List -from rdflib import Namespace, Graph, Literal, URIRef, RDF +from rdflib import Namespace, Graph, Literal, URIRef, RDF, XSD class Path: @@ -100,8 +100,8 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: graph.add((cell, RDF.type, vg.Cell)) # add its properties, recursively if needed - graph.add((cell, vg.positionPercent, Literal(self.position_percent))) - graph.add((cell, vg.inversionPercent, Literal(self.inversion_percent))) + graph.add((cell, vg.positionPercent, Literal(self.position_percent, datatype=XSD.double))) + graph.add((cell, vg.inversionPercent, Literal(self.inversion_percent, datatype=XSD.double))) for region in self.cell_region: region.ns = inner_ns graph.add((cell, vg.cellRegions, region.ns_term())) # can have multiple regions From 3f4984c1ab56e1424cbf8d674e2b9cbc5b790e29 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Mon, 22 Jun 2020 14:37:03 +0200 Subject: [PATCH 12/25] fix orientation --- matrixcomponent/PangenomeSchematic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index f7199ab..b5e403d 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -210,8 +210,8 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li ocell.cell_region.append(oregion) path = self.path_names[bin.path_id] - oposition_begin = ontology.Position(real_begin, begin >= end, path, cell_ns) - oposition_end = ontology.Position(real_end, begin >= end, path, cell_ns) + oposition_begin = ontology.Position(real_begin, begin < end, path, cell_ns) + oposition_end = ontology.Position(real_end, begin < end, path, cell_ns) oposition_dict[oposition_begin.ns_term()] = oposition_begin oposition_dict[oposition_end.ns_term()] = oposition_end From a96974e4317b0febb532a9da8516a5b13d946ee7 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Wed, 24 Jun 2020 17:49:25 +0200 Subject: [PATCH 13/25] add base IRI --- matrixcomponent/PangenomeSchematic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index b5e403d..0aeabc0 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -142,7 +142,7 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li if ontology_folder: zoom_level = ontology.ZoomLevel() zoom_level.zoom_factor = schematic.bin_width - zoom_level.ns = URIRef('pg/') + zoom_level.ns = URIRef('http://example.org/pg/') prev_comp_id = -1 cell_counter = 0 From 350e85d64f8a5d3b888c8fffde5d039fa01573ea Mon Sep 17 00:00:00 2001 From: subwaystation Date: Wed, 24 Jun 2020 22:52:40 +0200 Subject: [PATCH 14/25] added example SPARQL queries --- queries/selectBins1To5OfZoomlevel1.rq | 24 +++++++++++++++++++ .../selectLinksFilterBins1To5OfZoomLevel1.rq | 15 ++++++++++++ queries/selectPangenomeSeq1To5.rq | 9 +++++++ 3 files changed, 48 insertions(+) create mode 100644 queries/selectBins1To5OfZoomlevel1.rq create mode 100644 queries/selectLinksFilterBins1To5OfZoomLevel1.rq create mode 100644 queries/selectPangenomeSeq1To5.rq diff --git a/queries/selectBins1To5OfZoomlevel1.rq b/queries/selectBins1To5OfZoomlevel1.rq new file mode 100644 index 0000000..8e71e99 --- /dev/null +++ b/queries/selectBins1To5OfZoomlevel1.rq @@ -0,0 +1,24 @@ +PREFIX vg: +PREFIX faldo: +PREFIX rdfs: + +SELECT ?cell ?cellregion ?inversionpercent ?positionpercent ?beginpos ?endpos ?path +WHERE { + ?zoomlevel a vg:ZoomLevel; + vg:components ?components; + vg:zoomFactor ?zoomfactor . + FILTER(?zoomfactor = 1) + ?components vg:bins ?bin . + ?bin vg:cells ?cell; + vg:binRank ?binrank . + FILTER(?binrank < 6 && ?binrank > 0) + ?cell vg:cellRegions ?faldoregion; + vg:inversionPercent ?inversionpercent; + vg:positionPercent ?positionpercent . + ?faldoregion faldo:begin ?begin; + faldo:end ?end . + ?begin faldo:position ?beginpos . + ?end faldo:position ?endpos . + ?begin faldo:reference ?reference . + # THIS DOES NOT WORK BUT I DON'T KNOW WHY ?reference rdfs:label ?path . +} \ No newline at end of file diff --git a/queries/selectLinksFilterBins1To5OfZoomLevel1.rq b/queries/selectLinksFilterBins1To5OfZoomLevel1.rq new file mode 100644 index 0000000..ab2cb21 --- /dev/null +++ b/queries/selectLinksFilterBins1To5OfZoomLevel1.rq @@ -0,0 +1,15 @@ +PREFIX vg: +PREFIX faldo: +PREFIX rdfs: + +SELECT * +WHERE { + ?link a vg:Link; + vg:arrival ?arrivalbin; + vg:departure ?departurebin; + vg:linkPaths ?path . + # THIS DOES NOT WORK AND I DON'T KNOW WHY ?path rdfs:label ?l . + ?arrivalbin vg:binRank ?arrivalbinrank . + ?departurebin vg:binRank ?departurebinrank . + FILTER((?arrivalbinrank < 6 && ?arrivalbinrank > 0) || (?departurebinrank < 6 && ?departurebinrank > 0)) +} \ No newline at end of file diff --git a/queries/selectPangenomeSeq1To5.rq b/queries/selectPangenomeSeq1To5.rq new file mode 100644 index 0000000..464c299 --- /dev/null +++ b/queries/selectPangenomeSeq1To5.rq @@ -0,0 +1,9 @@ +PREFIX vg: +PREFIX rdf: +SELECT (SUBSTR(group_concat(?sequence; separator=''), 1,5) as ?panSeq) +{SELECT + * #(SUBSTR(group_concat(?sequence; separator=''), 1,5) as ?panSeq) +WHERE {?s a vg:Node; + rdf:value ?sequence . + } +ORDER BY ?s \ No newline at end of file From d2eb34c6f0e17f93587752755979dd0115009804 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Fri, 26 Jun 2020 14:25:51 +0200 Subject: [PATCH 15/25] Link -> ZoomLevel, replace pg with vg, add 'path/' --- matrixcomponent/PangenomeSchematic.py | 3 ++- matrixcomponent/ontology.py | 6 +++++- queries/selectBins1To5OfZoomlevel1.rq | 1 - queries/selectLinksFilterBins1To5OfZoomLevel1.rq | 11 +++++++---- queries/selectPangenomeSeq1To5.rq | 16 +++++++++------- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index 0aeabc0..85bf5d8 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -142,7 +142,7 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li if ontology_folder: zoom_level = ontology.ZoomLevel() zoom_level.zoom_factor = schematic.bin_width - zoom_level.ns = URIRef('http://example.org/pg/') + zoom_level.ns = URIRef('http://example.org/vg/') prev_comp_id = -1 cell_counter = 0 @@ -250,6 +250,7 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li olink.arrival = to_bin.ns_term() olink.paths = [self.path_names[k] for k in link.participants] + olink.linkZoomLevel = zoom_level.ns_term() zoom_level.links.append(olink) g = Graph() diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py index 0f7833d..8d9b8e3 100644 --- a/matrixcomponent/ontology.py +++ b/matrixcomponent/ontology.py @@ -9,7 +9,7 @@ def __init__(self, path): self.path = path def ns_term(self): - return self.path # path1 + return "path/" + self.path # path1 def add_to_graph(self, graph: Graph, vg: Namespace, faldo: Namespace) -> None: path = URIRef(self.ns_term()) # str representation @@ -156,6 +156,7 @@ class Link: paths: List[str] forward_link_edge: int reverse_link_edge: int + linkZoomLevel: str def __init__(self): self.paths = [] @@ -163,6 +164,7 @@ def __init__(self): self.reverse_link_edge = -1 self.arrival = '' self.departure = '' + self.linkZoomLevel = '' def __str__(self): return "link" + str(self.id) # bin1 @@ -182,6 +184,8 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: # add its properties, recursively if needed graph.add((link, vg.linkRank, Literal(self.id))) + if self.linkZoomLevel: + graph.add((link, vg.linkZoomLevel, URIRef(self.linkZoomLevel))) if self.arrival: graph.add((link, vg.arrival, URIRef(self.arrival))) diff --git a/queries/selectBins1To5OfZoomlevel1.rq b/queries/selectBins1To5OfZoomlevel1.rq index 8e71e99..a5c1f5c 100644 --- a/queries/selectBins1To5OfZoomlevel1.rq +++ b/queries/selectBins1To5OfZoomlevel1.rq @@ -20,5 +20,4 @@ WHERE { ?begin faldo:position ?beginpos . ?end faldo:position ?endpos . ?begin faldo:reference ?reference . - # THIS DOES NOT WORK BUT I DON'T KNOW WHY ?reference rdfs:label ?path . } \ No newline at end of file diff --git a/queries/selectLinksFilterBins1To5OfZoomLevel1.rq b/queries/selectLinksFilterBins1To5OfZoomLevel1.rq index ab2cb21..f76b6a3 100644 --- a/queries/selectLinksFilterBins1To5OfZoomLevel1.rq +++ b/queries/selectLinksFilterBins1To5OfZoomLevel1.rq @@ -2,14 +2,17 @@ PREFIX vg: PREFIX faldo: PREFIX rdfs: -SELECT * +SELECT ?link ?path ?arrivalbinrank ?departurebinrank ?zoomfactor WHERE { ?link a vg:Link; vg:arrival ?arrivalbin; vg:departure ?departurebin; - vg:linkPaths ?path . - # THIS DOES NOT WORK AND I DON'T KNOW WHY ?path rdfs:label ?l . + vg:linkPaths ?path ; + vg:linkZoomLevel ?zoomlevel . ?arrivalbin vg:binRank ?arrivalbinrank . ?departurebin vg:binRank ?departurebinrank . - FILTER((?arrivalbinrank < 6 && ?arrivalbinrank > 0) || (?departurebinrank < 6 && ?departurebinrank > 0)) + FILTER((?arrivalbinrank < 6 && ?arrivalbinrank > 0) + || (?departurebinrank < 6 && ?departurebinrank > 0)) + ?zoomlevel vg:zoomFactor ?zoomfactor . + FILTER(?zoomfactor = 1) } \ No newline at end of file diff --git a/queries/selectPangenomeSeq1To5.rq b/queries/selectPangenomeSeq1To5.rq index 464c299..4159e0a 100644 --- a/queries/selectPangenomeSeq1To5.rq +++ b/queries/selectPangenomeSeq1To5.rq @@ -1,9 +1,11 @@ PREFIX vg: PREFIX rdf: -SELECT (SUBSTR(group_concat(?sequence; separator=''), 1,5) as ?panSeq) -{SELECT - * #(SUBSTR(group_concat(?sequence; separator=''), 1,5) as ?panSeq) -WHERE {?s a vg:Node; - rdf:value ?sequence . - } -ORDER BY ?s \ No newline at end of file + +SELECT (SUBSTR(group_concat(?sequence; separator=''), 1,5) as ?panSeq) { + SELECT + * + WHERE {?s a vg:Node; + rdf:value ?sequence . + } + ORDER BY ?s +} From 209d6973562d0b3ece51de65d222ec7c69ec3d34 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Fri, 26 Jun 2020 15:04:37 +0200 Subject: [PATCH 16/25] Actually emit the position percentage instead of the coverage of a bin. --- matrixcomponent/JSONparser.py | 2 +- matrixcomponent/PangenomeSchematic.py | 2 +- matrixcomponent/matrix.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/matrixcomponent/JSONparser.py b/matrixcomponent/JSONparser.py index e56302d..51ed15b 100644 --- a/matrixcomponent/JSONparser.py +++ b/matrixcomponent/JSONparser.py @@ -41,7 +41,7 @@ def process_path(line=None): for r in ranges: compressed_ranges.extend(r) - bin = matrix.Bin(b[0], b[1], b[2], compressed_ranges, 0) + bin = matrix.Bin(b[0], b[1], b[2], compressed_ranges, 0, b[3]) p.bins.setdefault(bin.bin_id, bin) # do the major part of the segmentation.find_dividers() method diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index 85bf5d8..cf787e0 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -195,7 +195,7 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li ocell.id = cell_counter ocell.path_id = self.path_names[bin.path_id] # saved in the populate_component_matrix ocell.inversion_percent = bin.inversion - ocell.position_percent = bin.coverage + ocell.position_percent = bin.position # todo: are begin,end the real bin_ids or the compressed ones? a sparse list sense cell_ns = URIRef(ocell.path_id + "/") diff --git a/matrixcomponent/matrix.py b/matrixcomponent/matrix.py index 6efe264..d2f3127 100644 --- a/matrixcomponent/matrix.py +++ b/matrixcomponent/matrix.py @@ -13,6 +13,7 @@ class Bin(recordclass.dataobject): inversion: float nucleotide_ranges: 'numpy.array' # List[List[int]] is encoded as a Numpy flat array - this saves memory path_id: int + position: float ## Path is all for input files From 7dc028bbbe1ce91548886033e1906e709d1530a3 Mon Sep 17 00:00:00 2001 From: Dmytro Trybushnyi Date: Sat, 4 Jul 2020 19:36:01 +0200 Subject: [PATCH 17/25] a big update: * parallel write out of the gzip compressed ontology files - no memory leaks due to the utilization of separate processes! * use the N-triples format to be 10x quicker than the Turtle (see format='nt' in PangenomeSchematic.py) * be gentle with the string variables, do not use "a"+"b" but rather "{0}{1}".format(a,b). This does not create small temporary object and leads to a lower memory fragmentation/leak * the RDF output folder is named *-rdf --- matrixcomponent/PangenomeSchematic.py | 328 +++++++++++++------------- matrixcomponent/matrix.py | 4 - matrixcomponent/ontology.py | 46 +--- segmentation.py | 9 +- 4 files changed, 177 insertions(+), 210 deletions(-) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index cf787e0..489b336 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -1,4 +1,8 @@ +import gzip import json +import os +import shutil + from collections import OrderedDict from statistics import mean @@ -10,6 +14,7 @@ from dataclasses import dataclass +from joblib import delayed from rdflib import URIRef, Graph, Namespace from matrixcomponent import JSON_VERSION, ontology @@ -17,6 +22,144 @@ from DNASkittleUtils.Contigs import Contig, write_contigs_to_file + +# move out of the class to be able to use with joblib +def write_rdf(schematic, ontology_output_path): + zoom_level = ontology.ZoomLevel() + zoom_level.zoom_factor = schematic.bin_width + zoom_level.ns = URIRef('http://example.org/vg/') + + prev_comp_id = -1 + cell_counter = 0 + ocomp_dict = {} + obin_dict = {} + oposition_dict = {} + + min_bin_id = schematic.first_bin + max_bin_id = schematic.last_bin + + for ic, component in enumerate(schematic.components): + ocomp = ontology.Component(ic + 1) + ocomp.ns = zoom_level.ns_term() + '/' + zoom_level.components.append(ocomp) + + # save the sequence 1-2-3-..-n as a bi-directed list + if prev_comp_id in ocomp_dict: + prev_comp = ocomp_dict[prev_comp_id] + ocomp.reverse_component_edge = prev_comp.ns_term() + prev_comp.forward_component_edge = ocomp.ns_term() + + ocomp_dict[ic] = ocomp + prev_comp_id = ic + + obin_ns = ocomp.ns_term() + '/' + obin_tmp = ontology.Bin() + obin_tmp.ns = obin_ns + + # bins + for bins in component.matrix: + for bin in bins[1][1]: # follow the compressed format + if bin: + cur_bin_id = bin.bin_id + obin = ontology.Bin() + obin.ns = obin_ns + obin.bin_rank = cur_bin_id + obin_dict[cur_bin_id] = obin + + if (cur_bin_id > min_bin_id): + obin_tmp.bin_rank = cur_bin_id - 1 + obin.reverse_bin_edge = obin_tmp.ns_term() # string value + if (cur_bin_id < max_bin_id): + obin_tmp.bin_rank = cur_bin_id + 1 + obin.forward_bin_edge = obin_tmp.ns_term() # string value + + ocomp.bins.append(obin) + + cell_counter = cell_counter + 1 + ocell = ontology.Cell() + ocell.id = cell_counter + ocell.path_id = schematic.path_names[bin.path_id] # saved in the populate_component_matrix + ocell.inversion_percent = bin.inversion + ocell.position_percent = bin.position + + # todo: are begin,end the real bin_ids or the compressed ones? a sparse list sense + cell_ns = URIRef("{0}/".format(ocell.path_id)) + for be in range(0, len(bin.nucleotide_ranges), 2): + begin, end = bin.nucleotide_ranges[be], bin.nucleotide_ranges[be + 1] + real_begin = begin if begin else end + real_end = end if end else begin + + oregion = ontology.Region() + oregion.begin = real_begin + oregion.end = real_end + ocell.cell_region.append(oregion) + + path = schematic.path_names[bin.path_id] + oposition_begin = ontology.Position(real_begin, begin < end, path, cell_ns) + oposition_end = ontology.Position(real_end, begin < end, path, cell_ns) + oposition_dict[oposition_begin.ns_term()] = oposition_begin + oposition_dict[oposition_end.ns_term()] = oposition_end + + obin.cells.append(ocell) + + # links between components and their bins + + olink_dict = {} + link_counter = 0 + for component in schematic.components: + # search in all arrivals component <-> component links; departures are iterated automatically + # every link from departures will be in some other arrival + for link in component.departures: + if len(link.participants): + link_counter = link_counter + 1 + olink = ontology.Link() + olink.id = link_counter + olink_dict[link_counter] = olink + + link_counter = 0 + for component in schematic.components: + # search in all arrivals component <-> component links; departures are iterated automatically + # every link from departures will be in some other arrival + for link in component.departures: + if len(link.participants): + link_counter = link_counter + 1 + olink = olink_dict[link_counter] + + if link.upstream in obin_dict: + from_bin = obin_dict[link.upstream] + olink.departure = from_bin.ns_term() + + if link.downstream in obin_dict: + to_bin = obin_dict[link.downstream] + olink.arrival = to_bin.ns_term() + + olink.paths = [schematic.path_names[k] for k in link.participants] + olink.linkZoomLevel = zoom_level.ns_term() + zoom_level.links.append(olink) + + g = Graph() + vg = Namespace('http://biohackathon.org/resource/vg#') + faldo = Namespace('http://biohackathon.org/resource/faldo#') + g.bind('vg', vg) + g.bind('faldo', faldo) + + # here the magic happens + zoom_level.add_to_graph(g, vg, faldo) + for oposition in oposition_dict.values(): + oposition.add_to_graph(g, vg, faldo) + for path in schematic.path_names: + ontology.Path(path).add_to_graph(g, vg, faldo) + + # format='nt' works 10x faster than 'turtle'; the produced files are 3x bigger + # do not forget to compress them afterwards - gives 15x diskspace reduction + g.serialize(destination=ontology_output_path, format='nt') + with open(ontology_output_path, 'rb') as fin: + with gzip.open(ontology_output_path + '.gz', 'wb') as fout: + shutil.copyfileobj(fin, fout) + fout.close() + os.remove(ontology_output_path) + + @dataclass class PangenomeSchematic: json_version: int @@ -39,14 +182,7 @@ def dumper(obj): ranges.append([flat_ranges[i], flat_ranges[i+1]]) return [obj.coverage, obj.inversion, ranges] if isinstance(obj, LinkColumn): - # todo: get rid of this once the JS side can work with sparse containers - if self.json_version <= 14: - bools = [False] * len(self.path_names) - for i in obj.participants: - bools[i] = True - return {'upstream':obj.upstream, 'downstream':obj.downstream, 'participants':bools} - else: - return {'upstream':obj.upstream, 'downstream':obj.downstream, 'participants':obj.participants.tolist()} + return {'upstream': obj.upstream, 'downstream': obj.downstream, 'participants': obj.participants.tolist()} if isinstance(obj, set): return list(obj) try: @@ -70,28 +206,7 @@ def update_first_last_bin(self): self.first_bin = 1 # these have not been properly initialized self.last_bin = self.components[-1].last_bin - def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_links, ontology_folder): - # todo: get rid of this once the JS side can work with sparse containers - if self.json_version <= 14: - empty = [] - for comp in self.components: - bools = [False] * len(self.path_names) - for i in comp.occupants: - bools[i] = True - comp.occupants = bools - - matrix = [empty] * len(self.path_names) - fb, lb = comp.first_bin, comp.last_bin - for item in comp.matrix: - padded = [empty] * (lb - fb + 1) - sliced = item[1] - for id, val in zip(sliced[0], sliced[1]): - padded[id] = val - matrix[item[0]] = padded - - comp.matrix = matrix - - + def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_links, ontology_folder, parallel): """Splits one Schematic into multiple files with their own unique first and last_bin based on the volume of data desired per file specified by cells_per_file. """ @@ -139,136 +254,20 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li c = folder.joinpath(schematic.fasta_filename(i)) write_contigs_to_file(c, chunk) - if ontology_folder: - zoom_level = ontology.ZoomLevel() - zoom_level.zoom_factor = schematic.bin_width - zoom_level.ns = URIRef('http://example.org/vg/') - - prev_comp_id = -1 - cell_counter = 0 - ocomp_dict = {} - obin_dict = {} - oposition_dict = {} - - min_bin_id = schematic.first_bin - max_bin_id = schematic.last_bin - - for ic, component in enumerate(schematic.components): - ocomp = ontology.Component(ic+1) - ocomp.ns = zoom_level.ns_term() + '/' - zoom_level.components.append(ocomp) - - # save the sequence 1-2-3-..-n as a bi-directed list - if prev_comp_id in ocomp_dict: - prev_comp = ocomp_dict[prev_comp_id] - ocomp.reverse_component_edge = prev_comp.ns_term() - prev_comp.forward_component_edge = ocomp.ns_term() - - ocomp_dict[ic] = ocomp - prev_comp_id = ic - - obin_ns = ocomp.ns_term() + '/' - obin_tmp = ontology.Bin() - obin_tmp.ns = obin_ns - - # bins - for bins in component.matrix: - for bin in bins[1][1]: # follow the compressed format - if bin: - cur_bin_id = bin.bin_id - obin = ontology.Bin() - obin.ns = obin_ns - obin.bin_rank = cur_bin_id - obin_dict[cur_bin_id] = obin - - if (cur_bin_id > min_bin_id): - obin_tmp.bin_rank = cur_bin_id - 1 - obin.reverse_bin_edge = obin_tmp.ns_term() # string value - if (cur_bin_id < max_bin_id): - obin_tmp.bin_rank = cur_bin_id + 1 - obin.forward_bin_edge = obin_tmp.ns_term() # string value - - ocomp.bins.append(obin) - - cell_counter = cell_counter + 1 - ocell = ontology.Cell() - ocell.id = cell_counter - ocell.path_id = self.path_names[bin.path_id] # saved in the populate_component_matrix - ocell.inversion_percent = bin.inversion - ocell.position_percent = bin.position - - # todo: are begin,end the real bin_ids or the compressed ones? a sparse list sense - cell_ns = URIRef(ocell.path_id + "/") - for i in range(0, len(bin.nucleotide_ranges), 2): - begin, end = bin.nucleotide_ranges[i], bin.nucleotide_ranges[i+1] - real_begin = begin if begin else end - real_end = end if end else begin - - oregion = ontology.Region() - oregion.begin = real_begin - oregion.end = real_end - ocell.cell_region.append(oregion) - - path = self.path_names[bin.path_id] - oposition_begin = ontology.Position(real_begin, begin < end, path, cell_ns) - oposition_end = ontology.Position(real_end, begin < end, path, cell_ns) - oposition_dict[oposition_begin.ns_term()] = oposition_begin - oposition_dict[oposition_end.ns_term()] = oposition_end - - obin.cells.append(ocell) - - # links between components and their bins - - olink_dict = {} - link_counter = 0 - for component in schematic.components: - # search in all arrivals component <-> component links; departures are iterated automatically - # every link from departures will be in some other arrival - for link in component.departures: - if len(link.participants): - link_counter = link_counter + 1 - olink = ontology.Link() - olink.id = link_counter - olink_dict[link_counter] = olink - - - link_counter = 0 - for component in schematic.components: - # search in all arrivals component <-> component links; departures are iterated automatically - # every link from departures will be in some other arrival - for link in component.departures: - if len(link.participants): - link_counter = link_counter + 1 - olink = olink_dict[link_counter] - - if link.upstream in obin_dict: - from_bin = obin_dict[link.upstream] - olink.departure = from_bin.ns_term() - - if link.downstream in obin_dict: - to_bin = obin_dict[link.downstream] - olink.arrival = to_bin.ns_term() - - olink.paths = [self.path_names[k] for k in link.participants] - olink.linkZoomLevel = zoom_level.ns_term() - zoom_level.links.append(olink) - - g = Graph() - vg = Namespace('http://biohackathon.org/resource/vg#') - faldo = Namespace('http://biohackathon.org/resource/faldo#') - g.bind('vg', vg) - g.bind('faldo', faldo) - - # here the magic happens - zoom_level.add_to_graph(g, vg, faldo) - for oposition in oposition_dict.values(): - oposition.add_to_graph(g, vg, faldo) - for path in self.path_names: - ontology.Path(path).add_to_graph(g, vg, faldo) - - p = ontology_folder.joinpath(schematic.ttl_filename(i)) - g.serialize(destination=str(p), format='turtle', encoding='utf-8') - + if ontology_folder: + # generator for the pairs (Schematics, path) - will be instantiated on the item access! + prepared_schematics = ( (PangenomeSchematic(JSON_VERSION, self.bin_width, self.components[cut:cut_points[i + 1]][0].first_bin, + self.components[cut:cut_points[i + 1]][-1].last_bin, self.includes_connectors, + self.components[cut:cut_points[i + 1]], self.path_names, + self.total_nr_files, self.pangenome_length), + str(ontology_folder.joinpath(self.ttl_filename(i))) ) + for i, cut in enumerate(cut_points[:-1]) if self.components[cut:cut_points[i + 1]] ) + + if parallel: + results = parallel(delayed(write_rdf)(sch, path) for (sch, path) in prepared_schematics) + else: + for (sch, path) in prepared_schematics: + write_rdf(sch, path) return bin2file_mapping @@ -287,12 +286,7 @@ def find_cut_points_in_file_split(self, columns_per_file, column_counts): def lazy_average_occupants(self): """grab four random components and check how many occupants they have""" samples = [self.components[int(len(self.components) * (perc/100))] for perc in range(1, 99)] - - # todo: get rid of this once the JS side can work with sparse containers - if self.json_version <= 14: - avg_paths = mean([sum(x.occupants) for x in samples]) - else: - avg_paths = mean([len(x.occupants) for x in samples]) + avg_paths = mean([len(x.occupants) for x in samples]) return avg_paths @@ -306,7 +300,7 @@ def fasta_filename(self, nth_file): return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.fa' def ttl_filename(self, nth_file): - return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.ttl' + return f'seq_chunk{self.pad_file_nr(nth_file)}_bin{self.bin_width}.nt' def write_index_file(self, folder, bin2file_mapping): diff --git a/matrixcomponent/matrix.py b/matrixcomponent/matrix.py index d2f3127..b2ba9e9 100644 --- a/matrixcomponent/matrix.py +++ b/matrixcomponent/matrix.py @@ -38,10 +38,6 @@ class LinkColumn(recordclass.dataobject): downstream: int participants: 'numpy.array' # ids of participated path_names - # todo: not more than 2^32 bins are supported - refactor the ontology code processing the Link items - def __hash__(self): - return (self.upstream << 32) + self.downstream - class Component: """Block of co-linear variation within a Graph Matrix diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py index 8d9b8e3..97479ef 100644 --- a/matrixcomponent/ontology.py +++ b/matrixcomponent/ontology.py @@ -9,7 +9,7 @@ def __init__(self, path): self.path = path def ns_term(self): - return "path/" + self.path # path1 + return "path/{0}".format(self.path) # path1 def add_to_graph(self, graph: Graph, vg: Namespace, faldo: Namespace) -> None: path = URIRef(self.ns_term()) # str representation @@ -30,11 +30,8 @@ def __init__(self, id, is_forward, path, ns): self.path = path self.ns = ns - def __str__(self): - return str(self.id) - def ns_term(self): - return self.ns + str(self) # path1/2 + return self.ns + "{0}".format(str(self.id)) def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: position = self.ns_term() # str representation @@ -56,11 +53,8 @@ class Region: begin: int end: int - def __str__(self): - return str(self.begin) + "-" + str(self.end) # 2-10 - def ns_term(self): - return self.ns + str(self) # path1/... + return self.ns + "{0}-{1}".format(str(self.begin), str(self.end)) def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: region = self.ns_term() # str representation @@ -86,15 +80,12 @@ class Cell: def __init__(self): self.cell_region = [] - def __str__(self): - return "cell" + self.path_id # cell1 - def ns_term(self): - return self.ns + str(self) + return self.ns + "cell{0}".format(str(self.path_id)) def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: cell = self.ns_term() - inner_ns = URIRef(self.path_id + "/") + inner_ns = URIRef("{0}/".format(self.path_id)) # add the object itself graph.add((cell, RDF.type, vg.Cell)) @@ -120,11 +111,8 @@ def __init__(self): self.forward_bin_edge = '' self.reverse_bin_edge = '' - def __str__(self): - return "bin" + str(self.bin_rank) # bin1 - def ns_term(self): - return self.ns + str(self) + return self.ns + "bin{0}".format(str(self.bin_rank)) def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: bin = self.ns_term() @@ -166,17 +154,11 @@ def __init__(self): self.departure = '' self.linkZoomLevel = '' - def __str__(self): - return "link" + str(self.id) # bin1 - - def ns_term(self): - return self.ns + str(self) - + return self.ns + "link{0}".format(str(self.id)) def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: link = self.ns_term() - inner_ns = link + "/" # add the object itself graph.add((link, RDF.type, vg.Link)) @@ -197,10 +179,10 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: # add the reference to another object if needed if self.forward_link_edge > -1: - graph.add((link, vg.forwardLinkEdge, self.ns + "link" + str(self.forward_link_edge))) + graph.add((link, vg.forwardLinkEdge, "{0}link{1}".format(self.ns, str(self.forward_link_edge)))) if self.reverse_link_edge > -1: - graph.add((link, vg.reverseLinkEdge, self.ns + "link" + str(self.reverse_link_edge))) + graph.add((link, vg.reverseLinkEdge, "{0}link{1}".format(self.ns, str(self.reverse_link_edge)))) class Component: @@ -217,11 +199,8 @@ def __init__(self, id): self.forward_component_edge = '' self.reverse_component_edge = '' - def __str__(self): - return "component" + str(self.id) # component1 - def ns_term(self): - return self.ns + self.__str__() + return self.ns + "component{0}".format(str(self.id)) def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: component = self.ns_term() @@ -256,11 +235,8 @@ def __init__(self): self.components = [] self.links = [] - def __str__(self): - return "zoom" + str(self.zoom_factor) #zoom1000 - def ns_term(self): - return self.ns + self.__str__() + return self.ns + "zoom{0}".format(str(self.zoom_factor)) def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: zoomfactor = self.ns_term() diff --git a/segmentation.py b/segmentation.py index cb36fd4..b5f3e24 100644 --- a/segmentation.py +++ b/segmentation.py @@ -282,7 +282,7 @@ def _split_lines(self, text, width): return argparse.HelpFormatter._split_lines(self, text, width) -def write_files(folder, ontology_folder, odgi_fasta: Path, schematic: PangenomeSchematic, no_adjacent_links): +def write_files(folder, ontology_folder, odgi_fasta: Path, schematic: PangenomeSchematic, no_adjacent_links, parallel): os.makedirs(folder, exist_ok=True) # make directory for all files if ontology_folder: os.makedirs(ontology_folder, exist_ok=True) @@ -291,7 +291,8 @@ def write_files(folder, ontology_folder, odgi_fasta: Path, schematic: PangenomeS if odgi_fasta: fasta = read_contigs(odgi_fasta)[0] - bin2file_mapping = schematic.split_and_write(args.cells_per_file, folder, fasta, no_adjacent_links, ontology_folder) + bin2file_mapping = schematic.split_and_write(args.cells_per_file, folder, fasta, no_adjacent_links, + ontology_folder, parallel) schematic.write_index_file(folder, bin2file_mapping) @@ -405,9 +406,9 @@ def main(): ontology_folder_path = None if args.do_ttl: - ontology_folder_path = osPath(args.output_folder).joinpath(path_name + '-turtle') + ontology_folder_path = osPath(args.output_folder).joinpath(path_name + '-rdf') - write_files(folder_path, ontology_folder_path, args.fasta, schematic, args.no_adjacent_links) + write_files(folder_path, ontology_folder_path, args.fasta, schematic, args.no_adjacent_links, parallel) LOGGER.info("Finished processing the file " + json_file) From 46d9c204798b3735021e3f247ffe1123582fa2b2 Mon Sep 17 00:00:00 2001 From: Toshiyuki Yokoyama <6br@users.noreply.github.com> Date: Sat, 18 Jul 2020 23:40:15 +0900 Subject: [PATCH 18/25] Update requirements.txt --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index a40f53d..f2ac401 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ DNASkittleUtils==1.0.13 numpy==1.18.2 pytest==5.4.1 sortedcontainers==2.1.0 -joblib==0.14.1 +joblib==0.1 numba==0.48.0 psutil==5.7.0 -recordclass==0.13.2 \ No newline at end of file +recordclass==0.13.2 From 1f5a379184b9c2427f14a7619bc34ecb39270685 Mon Sep 17 00:00:00 2001 From: 6br Date: Sun, 26 Jul 2020 12:41:07 +0900 Subject: [PATCH 19/25] Add 'path/' on cells --- matrixcomponent/ontology.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py index 97479ef..3922f1d 100644 --- a/matrixcomponent/ontology.py +++ b/matrixcomponent/ontology.py @@ -81,11 +81,11 @@ def __init__(self): self.cell_region = [] def ns_term(self): - return self.ns + "cell{0}".format(str(self.path_id)) + return self.ns + "cell{0}/path/".format(str(self.path_id)) def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: cell = self.ns_term() - inner_ns = URIRef("{0}/".format(self.path_id)) + inner_ns = URIRef("{0}/path/".format(self.path_id)) # add the object itself graph.add((cell, RDF.type, vg.Cell)) From e014b732b1ff647dc2d5e02f4869f16b4e1bfa69 Mon Sep 17 00:00:00 2001 From: 6br Date: Sun, 26 Jul 2020 13:01:34 +0900 Subject: [PATCH 20/25] Add assertion --- matrixcomponent/PangenomeSchematic.py | 1 + matrixcomponent/ontology.py | 1 + 2 files changed, 2 insertions(+) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index 489b336..1b137e9 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -124,6 +124,7 @@ def write_rdf(schematic, ontology_output_path): if len(link.participants): link_counter = link_counter + 1 olink = olink_dict[link_counter] + assert link.upstream in obin_dict == link.downstream in obin_dict if link.upstream in obin_dict: from_bin = obin_dict[link.upstream] diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py index 3922f1d..6d6196b 100644 --- a/matrixcomponent/ontology.py +++ b/matrixcomponent/ontology.py @@ -168,6 +168,7 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: if self.linkZoomLevel: graph.add((link, vg.linkZoomLevel, URIRef(self.linkZoomLevel))) + assert self.arrival and self.departure if self.arrival: graph.add((link, vg.arrival, URIRef(self.arrival))) From 9beafb3acfb5acf44907d3e1b5bf7a5580918037 Mon Sep 17 00:00:00 2001 From: 6br Date: Sun, 26 Jul 2020 13:14:09 +0900 Subject: [PATCH 21/25] Add assertion --- matrixcomponent/PangenomeSchematic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index 1b137e9..38c0054 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -124,14 +124,14 @@ def write_rdf(schematic, ontology_output_path): if len(link.participants): link_counter = link_counter + 1 olink = olink_dict[link_counter] - assert link.upstream in obin_dict == link.downstream in obin_dict + assert link.upstream in obin_dict == link.downstream in obin_dict, "upstream: {0}, downstream {1}".format(link.upstream, link.downstream) if link.upstream in obin_dict: from_bin = obin_dict[link.upstream] olink.departure = from_bin.ns_term() - if link.downstream in obin_dict: - to_bin = obin_dict[link.downstream] + if lin_k.downstream in obin_dict: + tobin = obin_dict[link.downstream] olink.arrival = to_bin.ns_term() olink.paths = [schematic.path_names[k] for k in link.participants] From 8a336d905fbb2d1dd316ed98697dec2d4343cd93 Mon Sep 17 00:00:00 2001 From: 6br Date: Sun, 26 Jul 2020 14:08:47 +0900 Subject: [PATCH 22/25] Add logger info --- matrixcomponent/PangenomeSchematic.py | 9 ++++++--- matrixcomponent/ontology.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index 38c0054..9496984 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -124,15 +124,18 @@ def write_rdf(schematic, ontology_output_path): if len(link.participants): link_counter = link_counter + 1 olink = olink_dict[link_counter] - assert link.upstream in obin_dict == link.downstream in obin_dict, "upstream: {0}, downstream {1}".format(link.upstream, link.downstream) if link.upstream in obin_dict: from_bin = obin_dict[link.upstream] olink.departure = from_bin.ns_term() + else: + LOGGER.info(f"No upstream {link.upstream}") - if lin_k.downstream in obin_dict: - tobin = obin_dict[link.downstream] + if link.downstream in obin_dict: + to_bin = obin_dict[link.downstream] olink.arrival = to_bin.ns_term() + else: + LOGGER.info(f"No downstream {link.downstream}") olink.paths = [schematic.path_names[k] for k in link.participants] olink.linkZoomLevel = zoom_level.ns_term() diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py index 6d6196b..d9e7301 100644 --- a/matrixcomponent/ontology.py +++ b/matrixcomponent/ontology.py @@ -168,7 +168,7 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: if self.linkZoomLevel: graph.add((link, vg.linkZoomLevel, URIRef(self.linkZoomLevel))) - assert self.arrival and self.departure + # assert self.arrival and self.departure if self.arrival: graph.add((link, vg.arrival, URIRef(self.arrival))) From 61ceb227323812012fa50e0e06294004df64c731 Mon Sep 17 00:00:00 2001 From: 6br Date: Sun, 26 Jul 2020 14:08:47 +0900 Subject: [PATCH 23/25] Register logger --- matrixcomponent/PangenomeSchematic.py | 11 ++++++++--- matrixcomponent/ontology.py | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index 38c0054..08ed5e5 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -22,6 +22,8 @@ from DNASkittleUtils.Contigs import Contig, write_contigs_to_file +LOGGER = logging.getLogger(__name__) +"""logging.Logger: The logger for this module""" # move out of the class to be able to use with joblib def write_rdf(schematic, ontology_output_path): @@ -124,15 +126,18 @@ def write_rdf(schematic, ontology_output_path): if len(link.participants): link_counter = link_counter + 1 olink = olink_dict[link_counter] - assert link.upstream in obin_dict == link.downstream in obin_dict, "upstream: {0}, downstream {1}".format(link.upstream, link.downstream) if link.upstream in obin_dict: from_bin = obin_dict[link.upstream] olink.departure = from_bin.ns_term() + else: + LOGGER.info(f"No upstream {link.upstream}") - if lin_k.downstream in obin_dict: - tobin = obin_dict[link.downstream] + if link.downstream in obin_dict: + to_bin = obin_dict[link.downstream] olink.arrival = to_bin.ns_term() + else: + LOGGER.info(f"No downstream {link.downstream}") olink.paths = [schematic.path_names[k] for k in link.participants] olink.linkZoomLevel = zoom_level.ns_term() diff --git a/matrixcomponent/ontology.py b/matrixcomponent/ontology.py index 6d6196b..d9e7301 100644 --- a/matrixcomponent/ontology.py +++ b/matrixcomponent/ontology.py @@ -168,7 +168,7 @@ def add_to_graph(self, graph: Graph, vg, faldo: Namespace) -> None: if self.linkZoomLevel: graph.add((link, vg.linkZoomLevel, URIRef(self.linkZoomLevel))) - assert self.arrival and self.departure + # assert self.arrival and self.departure if self.arrival: graph.add((link, vg.arrival, URIRef(self.arrival))) From 86eca81d850ba3f84e81f0e72b509824db07a72b Mon Sep 17 00:00:00 2001 From: 6br Date: Sun, 26 Jul 2020 14:54:35 +0900 Subject: [PATCH 24/25] Add bin logger --- matrixcomponent/PangenomeSchematic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index 8e433e0..ccdd9d1 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -106,6 +106,7 @@ def write_rdf(schematic, ontology_output_path): obin.cells.append(ocell) # links between components and their bins + LOGGER.info(f"Bin dictionary {obin_dict}") olink_dict = {} link_counter = 0 From f7047679d8a037a3faa0d621fb59856a605fee80 Mon Sep 17 00:00:00 2001 From: 6br Date: Sun, 26 Jul 2020 16:23:35 +0900 Subject: [PATCH 25/25] Disable parallel on rdf writer --- matrixcomponent/PangenomeSchematic.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/matrixcomponent/PangenomeSchematic.py b/matrixcomponent/PangenomeSchematic.py index ccdd9d1..dda5ff1 100644 --- a/matrixcomponent/PangenomeSchematic.py +++ b/matrixcomponent/PangenomeSchematic.py @@ -271,11 +271,12 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig, no_adjacent_li str(ontology_folder.joinpath(self.ttl_filename(i))) ) for i, cut in enumerate(cut_points[:-1]) if self.components[cut:cut_points[i + 1]] ) - if parallel: - results = parallel(delayed(write_rdf)(sch, path) for (sch, path) in prepared_schematics) - else: - for (sch, path) in prepared_schematics: - write_rdf(sch, path) + #TODO() Now parallel is disabled because links overlap across partial pangenomic schematic occurs, though they cannot rescue because the communication across threads is not implemented yet. + + #if parallel: + # results = parallel(delayed(write_rdf)(sch, path) for (sch, path) in prepared_schematics) + #else: + write_rdf(self, str(ontology_folder.joinpath(self.ttl_filename(0)))) return bin2file_mapping