Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ontology repacking and exporting - relates to #34 #44

Open
wants to merge 30 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
acb580f
do the data repack into the ontology containers with further RDF 'tur…
dimatr Apr 28, 2020
5c39f2a
Merge remote-tracking branch 'remotes/origin/master' into ontology
dimatr Apr 30, 2020
44caccc
further ontology fixes: real path names; better forward* and reverse*…
dimatr Apr 30, 2020
29550fd
- do not forget to store the path_id
dimatr May 11, 2020
b8d0e16
- explicit faldo:ExactPosition containers
dimatr May 12, 2020
1eaacd0
Merge branch 'master' into ontology
dimatr Jun 16, 2020
26e6901
write faldo:ForwardStrandPosition, faldo:position and faldo:reference…
dimatr Jun 16, 2020
db95d1d
create ontology folder when needed
dimatr Jun 16, 2020
fb1a3ec
proper name and strand directions
dimatr Jun 16, 2020
8e233ec
directions
dimatr Jun 16, 2020
d0bff9a
fix bin_edge in .ttl output; but might be slow
subwaystation Jun 17, 2020
3741e10
recycle temp objects, do not create new ones
dimatr Jun 18, 2020
d50210b
positionPercent and inversionPercent are printed as doubles
dimatr Jun 18, 2020
3f4984c
fix orientation
subwaystation Jun 22, 2020
a96974e
add base IRI
subwaystation Jun 24, 2020
350e85d
added example SPARQL queries
subwaystation Jun 24, 2020
d2eb34c
Link -> ZoomLevel, replace pg with vg, add 'path/'
subwaystation Jun 26, 2020
209d697
Actually emit the position percentage instead of the coverage of a bin.
subwaystation Jun 26, 2020
7dc028b
a big update:
dimatr Jul 4, 2020
46d9c20
Update requirements.txt
6br Jul 18, 2020
1f5a379
Add 'path/' on cells
6br Jul 26, 2020
e014b73
Add assertion
6br Jul 26, 2020
9beafb3
Add assertion
6br Jul 26, 2020
8a336d9
Add logger info
6br Jul 26, 2020
61ceb22
Register logger
6br Jul 26, 2020
7e9f6f5
Merge branch 'ontology' of https://github.com/graph-genome/component_…
6br Jul 26, 2020
6378acd
Register logger
6br Jul 26, 2020
545e4c1
Register logger
6br Jul 26, 2020
86eca81
Add bin logger
6br Jul 26, 2020
f704767
Disable parallel on rdf writer
6br Jul 26, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion data/chrk_ath_12samples_10kb.w100000_S.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{"odgi_version": 10,"bin_width": 100000,"pangenome_length": 210026186}
{"odgi_version": 12,"bin_width": 100000,"pangenome_length": 210026186}
{"bin_id":1}
{"bin_id":2}
{"bin_id":3}
Expand Down
2 changes: 1 addition & 1 deletion matrixcomponent/JSONparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def process_path(line=None):
if type(ranges) is not list and len(b) >= 6:
ranges = [[b[4], b[5]]]

bin = matrix.Bin(b[0], b[1], b[2], ranges)
bin = matrix.Bin(b[0], b[1], b[2], ranges, 0) # path_id = 0
p.bins.setdefault(bin.bin_id, bin)

p.links = np.asarray(path['links'], dtype='int32')
Expand Down
111 changes: 109 additions & 2 deletions matrixcomponent/PangenomeSchematic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@

from dataclasses import dataclass

from matrixcomponent import JSON_VERSION
from rdflib import URIRef, Graph, Namespace

from matrixcomponent import JSON_VERSION, ontology
from matrixcomponent.matrix import Component, Bin, LinkColumn

from DNASkittleUtils.Contigs import Contig, write_contigs_to_file
Expand Down Expand Up @@ -56,7 +58,7 @@ def update_first_last_bin(self):
self.first_bin = 1 # these have not been properly initialized
self.last_bin = self.components[-1].last_bin

def split_and_write(self, cells_per_file, folder, fasta : Contig):
def split_and_write(self, cells_per_file, folder, fasta : Contig, ontology_folder):
"""Splits one Schematic into multiple files with their own
unique first and last_bin based on the volume of data desired per
file specified by cells_per_file. """
Expand Down Expand Up @@ -100,6 +102,108 @@ def split_and_write(self, cells_per_file, folder, fasta : Contig):
c = folder.joinpath(schematic.fasta_filename(i))
write_contigs_to_file(c, chunk)

if ontology_folder:
zoom_level = ontology.ZoomLevel()
zoom_level.zoom_factor = schematic.bin_width
zoom_level.ns = URIRef('pg/')

prev_comp_id = -1
cell_counter = 0
ocomp_dict = {}
obin_dict = {}
for ic, component in enumerate(schematic.components):
ocomp = ontology.Component(ic+1)
ocomp.ns = zoom_level.ns_term() + '/'
zoom_level.components.append(ocomp)

# save the sequence 1-2-3-..-n as a bi-directed list
if prev_comp_id in ocomp_dict:
prev_comp = ocomp_dict[prev_comp_id]
ocomp.reverse_component_edge = prev_comp.ns_term()
prev_comp.forward_component_edge = ocomp.ns_term()

ocomp_dict[ic] = ocomp
prev_comp_id = ic

# bins
for bins in component.matrix:
prev_bin_id = -1
for bin in bins:
if bin:
cur_bin_id = bin.bin_id
obin = ontology.Bin()
obin.ns = ocomp.ns_term() + '/'
obin.bin_rank = cur_bin_id
obin_dict[cur_bin_id] = obin

# save the sequence 1-2-3-..-n as a bi-directed list
if prev_bin_id in obin_dict:
prev_bin = obin_dict[prev_bin_id]
prev_bin.forward_bin_edge = obin.ns_term()
obin.reverse_bin_edge = prev_bin.ns_term()

prev_bin_id = cur_bin_id
ocomp.bins.append(obin)

cell_counter = cell_counter + 1
ocell = ontology.Cell()
ocell.id = cell_counter
ocell.path_id = self.path_names[bin.path_id] # saved in the populate_component_matrix
ocell.inversion_percent = bin.inversion
ocell.position_percent = bin.coverage

# todo: are begin,end the real bin_ids or the compressed ones? a sparse list sense
for [begin, end] in bin.nucleotide_ranges:
oregion = ontology.Region()
oregion.begin = begin
oregion.end = end
ocell.cell_region.append(oregion)

obin.cells.append(ocell)

# links between components and their bins

all_links = []
for component in schematic.components:
# search in both arrivals and departures of the component <-> component links
all_links.extend(component.arrivals + component.departures)

link_counter = 0
for link in set(all_links): # no duplications; we use Link.__hash__() here - not nice ..
if len(link.participants):
link_counter = link_counter + 1
olink = ontology.Link()
olink.id = link_counter

6br marked this conversation as resolved.
Show resolved Hide resolved
from_bin = None
to_bin = None
if link.upstream in obin_dict:
from_bin = obin_dict[link.upstream]
from_bin.forward_bin_edge = link.downstream

if link.downstream in obin_dict:
to_bin = obin_dict[link.downstream]
olink.arrival = to_bin.ns_term()

if from_bin and to_bin:
from_bin.forward_bin_edge = to_bin.ns_term()
to_bin.reverse_bin_edge = from_bin.ns_term()

olink.paths = [self.path_names[k] for k in link.participants]
zoom_level.links.append(olink)

g = Graph()
vg = Namespace('http://biohackathon.org/resource/vg#')
faldo = Namespace('http://biohackathon.org/resource/faldo#')
g.bind('vg', vg)
g.bind('faldo', faldo)

zoom_level.add_to_graph(g, vg, faldo) # here the magic happens

p = ontology_folder.joinpath(schematic.ttl_filename(i))
g.serialize(destination=str(p), format='turtle', encoding='utf-8')


return bin2file_mapping

def find_cut_points_in_file_split(self, columns_per_file, column_counts):
Expand Down Expand Up @@ -129,6 +233,9 @@ def filename(self, nth_file):
def fasta_filename(self, nth_file):
    """Return the FASTA chunk filename for the nth output file at this bin width."""
    file_nr = self.pad_file_nr(nth_file)
    return 'seq_chunk{}_bin{}.fa'.format(file_nr, self.bin_width)

def ttl_filename(self, nth_file):
    """Return the Turtle (RDF) chunk filename for the nth output file at this bin width."""
    file_nr = self.pad_file_nr(nth_file)
    return 'seq_chunk{}_bin{}.ttl'.format(file_nr, self.bin_width)

def write_index_file(self, folder, bin2file_mapping):

file_contents = {'bin_width': self.bin_width,
Expand Down
5 changes: 5 additions & 0 deletions matrixcomponent/matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class Bin:
coverage: float
inversion: float
nucleotide_ranges: List[List[int]]
path_id: int
sequence: str = ''

## Path is all for input files
Expand All @@ -35,6 +36,10 @@ class LinkColumn:
num_paths: int
participants: 'numpy.array' # ids of participated path_names

# Hash only on the (upstream, downstream) endpoints, matching how the
# ontology export deduplicates links via set(); equal endpoints must collide.
def __hash__(self):
    """Hash the link by its (upstream, downstream) bin ids.

    A tuple hash replaces the previous ``(upstream << 32) + downstream``
    packing, which silently produced colliding hashes for distinct links
    once a bin id exceeded 2^32.
    """
    return hash((self.upstream, self.downstream))


class Component:
"""Block of co-linear variation within a Graph Matrix
Expand Down
Loading