Skip to content

Commit

Permalink
Merge pull request galaxyproject#16576 from claudiofr/dt_chain
Browse files Browse the repository at this point in the history
Support new genome browser chain file format
  • Loading branch information
mvdbeek committed Aug 24, 2023
2 parents 5cbc192 + 635a792 commit 02fb8c2
Show file tree
Hide file tree
Showing 3 changed files with 478 additions and 0 deletions.
2 changes: 2 additions & 0 deletions lib/galaxy/config/sample/datatypes_conf.xml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -960,6 +960,7 @@
<datatype extension="shp" type="galaxy.datatypes.gis:Shapefile" mimetype="application/octet-stream" display_in_upload="true" description="geospatial vector data format for geographic information system"/>
<!-- Flexible Image Transport System (FITS) used in Astronomy https://fits.gsfc.nasa.gov/ https://fits.gsfc.nasa.gov/rfc4047.txt -->
<datatype extension="fits" type="galaxy.datatypes.binary:FITS" mimetype="application/octet-stream" display_in_upload="true" description="Flexible Image Transport System (FITS) used in Astronomy"/>
<datatype extension="chain" type="galaxy.datatypes.chain:Chain" display_in_upload="true"/>
</registration>
<sniffers>
<!--
Expand Down Expand Up @@ -1061,6 +1062,7 @@
<sniffer type="galaxy.datatypes.binary:Pretext"/>
<sniffer type="galaxy.datatypes.annotation:Augustus"/>
<sniffer type="galaxy.datatypes.xml:Owl"/>
<sniffer type="galaxy.datatypes.chain:Chain"/>
<sniffer type="galaxy.datatypes.triples:Rdf"/>
<sniffer type="galaxy.datatypes.blast:BlastXml"/>
<sniffer type="galaxy.datatypes.images:Gifti" />
Expand Down
129 changes: 129 additions & 0 deletions lib/galaxy/datatypes/chain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""
Genome browser chain format class
"""

import logging

from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes.protocols import DatasetProtocol
from galaxy.datatypes.sniff import (
build_sniff_from_prefix,
FilePrefix,
)
from galaxy.util import (
commaify,
compression_utils,
nice_size,
)
from . import data

log = logging.getLogger(__name__)


@build_sniff_from_prefix
class Chain(data.Text):
    """
    Class describing a chain format alignment file.

    A chain file is a sequence of chain blocks. Each block starts with a
    header line whose first token is ``chain``, followed by alignment data
    lines holding either 3 integers or, for the last line of the block, a
    single integer. For complete details see
    https://genome.ucsc.edu/goldenPath/help/chain.html
    """

    edam_format = "format_3982"
    file_ext = "chain"

    # Only these values are accepted for the two strand attributes of a header line.
    strands = ["+", "-"]

    MetadataElement(
        name="chains", default=0, desc="Number of chains", readonly=True, visible=False, optional=False, no_value=0
    )

    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
        """
        Set the number of chains and the number of data lines in dataset.

        Blank lines (used as separators between chains) and comment lines
        starting with ``#`` are neither chains nor data, so they are skipped.
        A line is counted as a chain only when its first token is exactly
        ``chain``; every chain header line is also counted as a data line.

        :param dataset: dataset whose file is scanned and whose ``data_lines``
            and ``chains`` metadata are written
        :param overwrite: accepted for interface compatibility; not used here
        """
        data_lines = 0
        chains = 0
        with compression_utils.get_fileobj(dataset.file_name) as fh:
            for line in fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                data_lines += 1
                # Compare the whole first token so that e.g. "chainsaw ..." is not a chain.
                if line.split(None, 1)[0] == "chain":
                    chains += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.chains = chains

    def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
        """
        Set the peek and blurb of the dataset.

        The blurb shows the number of chains when that metadata is non-zero and
        falls back to the file size otherwise. Purged datasets get fixed
        placeholder texts instead.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name)
            if dataset.metadata.chains:
                dataset.blurb = f"{commaify(str(dataset.metadata.chains))} chains"
            else:
                dataset.blurb = nice_size(dataset.get_size())
        else:
            dataset.peek = "file does not exist"
            dataset.blurb = "file purged from disk"

    def _is_chain_header(self, tokens: list) -> bool:
        """Return True if the whitespace-split tokens form a plausible chain header line."""
        return (
            # 12 attributes, plus one more token when the optional chain id is present.
            len(tokens) in (12, 13)
            and tokens[0] == "chain"
            and tokens[4] in self.strands
            and tokens[9] in self.strands
            and tokens[3].isdigit()
            and tokens[5].isdigit()
            and tokens[6].isdigit()
        )

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is in chain format

        For complete details see https://genome.ucsc.edu/goldenPath/help/chain.html

        Rules for sniffing as True:

            We don't care about line length (other than empty lines).

            The first non-empty line must be a chain header line: its first
            token is 'chain' and it has 12 or 13 tokens representing the chain
            attributes.

            The 2 strand attributes must have values + or -. We verify that
            some of the other numeric attributes (a sequence length and a
            start/stop position pair) are integers.

            The header must be followed directly by alignment data lines, each
            consisting of either 1 or 3 integers separated by whitespace. The
            1 integer line must be the last line of the chain, i.e. it is
            followed by an empty line or by the end of the sniffed prefix.

            We will only check that the first chain and its alignment data
            lines are formatted correctly.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Chain().sniff( fname )
        False
        >>> fname = get_test_fname( '1.chain' )
        >>> Chain().sniff( fname )
        True
        >>>
        """
        fh = file_prefix.string_io()
        for line in fh:
            line = line.strip()
            if not line:
                # Skip leading empty lines.
                continue
            # First non-empty line: it alone decides whether we go on.
            if not self._is_chain_header(line.split()):
                return False
            prior_token_len = 0
            for line in fh:
                line = line.strip()
                if line == "":
                    # An empty line terminates the first chain.
                    break
                tokens = line.split()
                if prior_token_len == 1:
                    # Nothing may follow the single-integer line within a chain.
                    return False
                if len(tokens) not in (1, 3):
                    return False
                if not all(token.isdigit() for token in tokens):
                    return False
                prior_token_len = len(tokens)
            # The chain is valid only if it ended on the single-integer line.
            # NOTE(review): a first chain that is longer than the sniffed prefix
            # therefore sniffs as False - confirm whether that is acceptable.
            return prior_token_len == 1
        return False
Loading

0 comments on commit 02fb8c2

Please sign in to comment.