Skip to content

Commit 1eebe94

Browse files
committed
Add bigtools bigbed writer
1 parent 43bda60 commit 1eebe94

File tree

2 files changed

+169
-141
lines changed

2 files changed

+169
-141
lines changed

bioframe/io/bed.py

Lines changed: 58 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -597,50 +597,15 @@ def parse_bed_schema(schema: str) -> tuple[int, bool]:
597597
return n, extended
598598

599599

600-
def to_bed(
600+
def to_bed_dataframe(
601601
df: pd.DataFrame,
602-
path: str | pathlib.Path | None = None,
603-
*,
604602
schema: str = "infer",
605603
validate_fields: bool = True,
606604
require_sorted: bool = False,
607605
chromsizes: dict | pd.Series | None = None,
608606
strict_score: bool = False,
609607
replace_na: bool = True,
610-
na_rep: str = "nan",
611-
) -> str | None:
612-
"""Write a DataFrame to a BED file.
613-
614-
Parameters
615-
----------
616-
df : pd.DataFrame
617-
DataFrame to write.
618-
path : str or Path, optional
619-
Path to write the BED file to. If ``None``, the serialized BED file is
620-
returned as a string.
621-
schema : str, optional [default: "infer"]
622-
BED schema to use. If ``"infer"``, the schema is inferred from the
623-
DataFrame's columns.
624-
validate_fields : bool, optional [default: True]
625-
Whether to validate the fields of the BED file.
626-
require_sorted : bool, optional [default: False]
627-
Whether to require the BED file to be sorted.
628-
chromsizes : dict or pd.Series, optional
629-
Chromosome sizes to validate against.
630-
strict_score : bool, optional [default: False]
631-
Whether to strictly enforce validation of the score field (0-1000).
632-
replace_na : bool, optional [default: True]
633-
Whether to replace null values of standard BED fields with
634-
compliant uninformative values.
635-
na_rep : str, optional [default: "nan"]
636-
String representation of null values if written.
637-
638-
Returns
639-
-------
640-
str or None:
641-
The serialized BED file as a string if ``path`` is ``None``, otherwise
642-
``None``.
643-
"""
608+
) -> pd.DataFrame:
644609
if schema == "infer":
645610
n, extended = infer_bed_schema(df)
646611
else:
@@ -712,4 +677,60 @@ def to_bed(
712677
if col in custom_cols:
713678
bed[col] = df[col]
714679

680+
return bed
681+
682+
683+
def to_bed(
    df: pd.DataFrame,
    path: str | pathlib.Path | None = None,
    *,
    schema: str = "infer",
    validate_fields: bool = True,
    require_sorted: bool = False,
    chromsizes: dict | pd.Series | None = None,
    strict_score: bool = False,
    replace_na: bool = True,
    na_rep: str = "nan",
) -> str | None:
    """Serialize a DataFrame as tab-separated BED text.

    The frame is first normalized by :func:`to_bed_dataframe` and then
    written without a header row or index column.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to write.
    path : str or Path, optional
        Destination of the BED file. When ``None``, nothing is written to
        disk and the serialized text is returned instead.
    schema : str, optional [default: "infer"]
        BED schema to use. With ``"infer"``, the schema is derived from the
        DataFrame's columns.
    validate_fields : bool, optional [default: True]
        Whether to validate the fields of the BED file.
    require_sorted : bool, optional [default: False]
        Whether to require the BED file to be sorted.
    chromsizes : dict or pd.Series, optional
        Chromosome sizes to validate against.
    strict_score : bool, optional [default: False]
        Whether to strictly enforce validation of the score field (0-1000).
    replace_na : bool, optional [default: True]
        Whether to replace null values of standard BED fields with
        compliant uninformative values.
    na_rep : str, optional [default: "nan"]
        Text written in place of any remaining null values.

    Returns
    -------
    str or None
        The serialized BED text if ``path`` is ``None``, otherwise ``None``.
    """
    # Every option except ``path`` and ``na_rep`` controls normalization, so
    # those are forwarded unchanged to the DataFrame-level converter.
    normalize_options = {
        "schema": schema,
        "validate_fields": validate_fields,
        "require_sorted": require_sorted,
        "chromsizes": chromsizes,
        "strict_score": strict_score,
        "replace_na": replace_na,
    }
    table = to_bed_dataframe(df, **normalize_options)

    serialized = table.to_csv(
        path,
        sep="\t",
        na_rep=na_rep,
        index=False,
        header=False,
    )
    return serialized

bioframe/io/fileops.py

Lines changed: 111 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -245,22 +245,21 @@ def read_alignments(fp, chrom=None, start=None, end=None):
245245
raise ImportError("pysam is required to use `read_alignments`") from None
246246

247247
ext = os.path.splitext(fp)[1]
248-
if ext == '.sam':
249-
mode = 'r'
250-
elif ext == '.bam':
251-
mode = 'rb'
252-
elif ext == '.cram':
253-
mode = 'rc'
248+
if ext == ".sam":
249+
mode = "r"
250+
elif ext == ".bam":
251+
mode = "rb"
252+
elif ext == ".cram":
253+
mode = "rc"
254254
else:
255-
raise ValueError(f'{ext} is not a supported filetype')
255+
raise ValueError(f"{ext} is not a supported filetype")
256256

257257
with closing(pysam.AlignmentFile(fp, mode)) as f:
258258
records = []
259259
for s in f.fetch(chrom, start, end):
260260
# Needed because array.array is not json serializable
261261
tags = [
262-
(k, v.tolist() if isinstance(v, array.array) else v)
263-
for k, v in s.tags
262+
(k, v.tolist() if isinstance(v, array.array) else v) for k, v in s.tags
264263
]
265264
records.append(
266265
(
@@ -487,9 +486,38 @@ def read_bigbed(path, chrom, start=None, end=None, engine="auto"):
487486
return df
488487

489488

490-
def to_bigwig(df, chromsizes, outpath, value_field=None, engine='ucsc', path_to_binary=None):
491-
"""
492-
Save a bedGraph-like dataframe as a binary BigWig track.
489+
def _find_ucsc_binary(path, cmd):
    """Resolve the command used to invoke a UCSC command-line tool.

    Parameters
    ----------
    path : str or os.PathLike or None
        Location of the tool. ``None`` means "look it up on ``PATH``".
        Otherwise either the full path to the executable (a path ending in
        ``cmd``) or the directory that contains it.
    cmd : str
        Name of the executable, e.g. ``"bedGraphToBigWig"``.

    Returns
    -------
    str
        ``cmd`` itself when it was found on ``PATH``, otherwise the full path
        to the executable.

    Raises
    ------
    ValueError
        If the executable is not on ``PATH`` (``path is None``), or if the
        resolved file does not exist or is not executable.
    """
    if path is None:
        # Explicit check rather than ``assert``: assertions are stripped when
        # Python runs with -O, which would silently skip this validation.
        if shutil.which(cmd) is None:
            raise ValueError(
                f"{cmd} is not present in the current environment. "
                f"Pass its location as the 'path_to_binary' parameter or "
                f"install it with, for example, conda install -y -c bioconda "
                f"ucsc-{cmd.lower()} "
            )
        return cmd

    # Accept pathlib.Path as well as plain strings.
    path = os.fspath(path)
    candidate = path if path.endswith(cmd) else os.path.join(path, cmd)

    # The file must both exist AND be executable. The negation has to wrap
    # the whole conjunction; ``not isfile(p) and access(p, X_OK)`` never
    # fires for a missing or non-executable file.
    if not (os.path.isfile(candidate) and os.access(candidate, os.X_OK)):
        raise ValueError(
            f"{cmd} is absent in the provided path or cannot be "
            f"executed: {path}. "
        )
    return candidate
515+
516+
517+
def to_bigwig(
518+
df, chromsizes, outpath, value_field=None, engine="ucsc", path_to_binary=None
519+
):
520+
"""Save a bedGraph-like dataframe as a binary BigWig file.
493521
494522
Parameters
495523
----------
@@ -507,7 +535,6 @@ def to_bigwig(df, chromsizes, outpath, value_field=None, engine='ucsc', path_to_
507535
Provide system path to the bedGraphToBigWig binary.
508536
engine : {'ucsc', 'bigtools'}, optional
509537
Engine to use for creating the BigWig file.
510-
511538
"""
512539

513540
is_bedgraph = True
@@ -528,43 +555,21 @@ def to_bigwig(df, chromsizes, outpath, value_field=None, engine='ucsc', path_to_
528555
bg["chrom"] = bg["chrom"].astype(str)
529556
bg = bg.sort_values(["chrom", "start", "end"])
530557

531-
if chromsizes is None:
532-
chromsizes = df.groupby('chrom')['end']
533-
534-
if engine.lower() == 'ucsc':
535-
if path_to_binary is None:
536-
cmd = "bedGraphToBigWig"
537-
try:
538-
assert shutil.which(cmd) is not None
539-
except Exception:
540-
raise ValueError(
541-
"bedGraphToBigWig is not present in the current environment. "
542-
"Pass it as 'path_to_binary' parameter to bioframe.to_bigwig or "
543-
"install it with, for example, conda install -y -c bioconda "
544-
"ucsc-bedgraphtobigwig "
545-
) from None
546-
elif path_to_binary.endswith("bedGraphToBigWig"):
547-
if not os.path.isfile(path_to_binary) and os.access(path_to_binary, os.X_OK):
548-
raise ValueError(
549-
f"bedGraphToBigWig is absent in the provided path or cannot be "
550-
f"fexecuted: {path_to_binary}. "
551-
)
552-
cmd = path_to_binary
553-
else:
554-
cmd = os.path.join(path_to_binary, "bedGraphToBigWig")
555-
if not os.path.isfile(cmd) and os.access(cmd, os.X_OK):
556-
raise ValueError(
557-
f"bedGraphToBigWig is absent in the provided path or cannot be "
558-
f"executed: {path_to_binary}. "
559-
)
558+
if engine.lower() == "ucsc":
559+
cmd = _find_ucsc_binary(path_to_binary, "bedGraphToBigWig")
560560

561561
with tempfile.NamedTemporaryFile(suffix=".bg") as f, \
562-
tempfile.NamedTemporaryFile("wt", suffix=".chrom.sizes") as cs: # fmt: skip
563-
chromsizes.to_csv(cs, sep="\t", header=False)
562+
tempfile.NamedTemporaryFile("wt", suffix=".chrom.sizes") as cs: # fmt: skip # noqa: E501
563+
pd.Series(chromsizes).to_csv(cs, sep="\t", header=False)
564564
cs.flush()
565565

566566
bg.to_csv(
567-
f.name, sep="\t", columns=columns, index=False, header=False, na_rep="nan"
567+
f.name,
568+
sep="\t",
569+
columns=columns,
570+
index=False,
571+
header=False,
572+
na_rep="nan",
568573
)
569574

570575
p = subprocess.run(
@@ -573,21 +578,27 @@ def to_bigwig(df, chromsizes, outpath, value_field=None, engine='ucsc', path_to_
573578
)
574579
return p
575580

576-
elif engine.lower() == 'bigtools':
577-
import pybigtools
581+
elif engine.lower() == "bigtools":
582+
try:
583+
import pybigtools
584+
except ImportError:
585+
raise ImportError(
586+
"pybigtools is required to use engine='bigtools'"
587+
) from None
578588

579589
f = pybigtools.open(outpath, "w")
580590
if issubclass(type(chromsizes), pd.Series):
581591
chromsizes = chromsizes.astype(int).to_dict()
582592

583-
bg = bg.astype({'chrom':str, "start": int, "end": int, value_field: float})
593+
bg = bg.astype({"chrom": str, "start": int, "end": int, value_field: float})
584594
f.write(chroms=chromsizes, vals=bg.itertuples(index=False))
585595
f.close()
586596

587597

588-
def to_bigbed(df, chromsizes, outpath, schema="bed6", path_to_binary=None):
589-
"""
590-
Save a bedGraph-like dataframe as a binary BigWig track.
598+
def to_bigbed(
    df, chromsizes, outpath, schema="infer", engine="ucsc", path_to_binary=None
):
    """Save a BED-like dataframe as a binary BigBed file.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame with BED-like columns. It is normalized with
        ``to_bed_dataframe`` before being written.
    chromsizes : dict or pandas.Series
        Chromosome sizes, keyed by chromosome name.
    outpath : str
        Path of the BigBed file to create.
    schema : str, optional [default: "infer"]
        BED schema to use. If ``"infer"``, the schema is inferred from the
        data frame's columns.
    engine : {'ucsc', 'bigtools'}, optional
        Engine to use for creating the BigBed file.
    path_to_binary : str, optional
        Provide system path to the bedToBigBed binary. Only used by the
        ``'ucsc'`` engine.

    Returns
    -------
    subprocess.CompletedProcess or None
        The completed ``bedToBigBed`` process when ``engine='ucsc'``;
        ``None`` otherwise.

    Raises
    ------
    ImportError
        If ``engine='bigtools'`` is requested but pybigtools is not installed.
    """
    from bioframe.io.bed import infer_bed_schema, parse_bed_schema, to_bed_dataframe

    if schema == "infer":
        n, _ = infer_bed_schema(df)
    else:
        n, _ = parse_bed_schema(schema)

    bed = to_bed_dataframe(df, schema=schema)
    # Columns beyond the n standard BED fields are reported as "+m" extras.
    m = len(bed.columns) - n
    schema = f"bed{n}+{m}" if m > 0 else f"bed{n}"

    if engine.lower() == "ucsc":
        # Resolve the binary unconditionally: the helper handles both the
        # PATH lookup (None) and an explicit location. Guarding this call
        # with ``path_to_binary is None`` left ``cmd`` undefined whenever a
        # path was supplied.
        cmd = _find_ucsc_binary(path_to_binary, "bedToBigBed")

        with tempfile.NamedTemporaryFile(suffix=".bed") as f, \
             tempfile.NamedTemporaryFile("wt", suffix=".chrom.sizes") as cs:  # fmt: skip # noqa: E501
            pd.Series(chromsizes).to_csv(cs, sep="\t", header=False)
            # Make sure the sizes are on disk before the subprocess reads them.
            cs.flush()

            bed.to_csv(
                f.name,
                sep="\t",
                columns=bed.columns,
                index=False,
                header=False,
                na_rep="nan",
            )

            p = subprocess.run(
                [cmd, f"-type={schema}", f.name, cs.name, outpath],
                capture_output=True,
            )
        return p

    elif engine.lower() == "bigtools":
        try:
            import pybigtools
        except ImportError:
            raise ImportError(
                "pybigtools is required to use engine='bigtools'"
            ) from None

        if isinstance(chromsizes, pd.Series):
            chromsizes = chromsizes.astype(int).to_dict()

        bed = bed.astype({"chrom": str, "start": int, "end": int})
        # Each record is (chrom, start, end, rest), where rest is the
        # remaining fields joined by tabs.
        record_iter = (
            (row[0], row[1], row[2], "\t".join(str(x) for x in row[3:]))
            for row in bed.itertuples(index=False)
        )

        # Open the output only once the inputs are prepared, and always
        # release the handle even if writing fails.
        f = pybigtools.open(outpath, "w")
        try:
            f.write(chroms=chromsizes, vals=record_iter)
        finally:
            f.close()

0 commit comments

Comments
 (0)