@@ -245,22 +245,21 @@ def read_alignments(fp, chrom=None, start=None, end=None):
245
245
raise ImportError ("pysam is required to use `read_alignments`" ) from None
246
246
247
247
ext = os .path .splitext (fp )[1 ]
248
- if ext == ' .sam' :
249
- mode = 'r'
250
- elif ext == ' .bam' :
251
- mode = 'rb'
252
- elif ext == ' .cram' :
253
- mode = 'rc'
248
+ if ext == " .sam" :
249
+ mode = "r"
250
+ elif ext == " .bam" :
251
+ mode = "rb"
252
+ elif ext == " .cram" :
253
+ mode = "rc"
254
254
else :
255
- raise ValueError (f' { ext } is not a supported filetype' )
255
+ raise ValueError (f" { ext } is not a supported filetype" )
256
256
257
257
with closing (pysam .AlignmentFile (fp , mode )) as f :
258
258
records = []
259
259
for s in f .fetch (chrom , start , end ):
260
260
# Needed because array.array is not json serializable
261
261
tags = [
262
- (k , v .tolist () if isinstance (v , array .array ) else v )
263
- for k , v in s .tags
262
+ (k , v .tolist () if isinstance (v , array .array ) else v ) for k , v in s .tags
264
263
]
265
264
records .append (
266
265
(
@@ -487,9 +486,38 @@ def read_bigbed(path, chrom, start=None, end=None, engine="auto"):
487
486
return df
488
487
489
488
490
- def to_bigwig (df , chromsizes , outpath , value_field = None , engine = 'ucsc' , path_to_binary = None ):
491
- """
492
- Save a bedGraph-like dataframe as a binary BigWig track.
489
def _find_ucsc_binary(path, cmd):
    """Resolve the command used to invoke a UCSC command-line tool.

    Parameters
    ----------
    path : str or path-like or None
        Location of the binary. If None, ``cmd`` is looked up on the system
        ``PATH``. If it ends with ``cmd`` it is treated as the path to the
        binary itself; otherwise it is treated as a directory containing
        ``cmd``.
    cmd : str
        Name of the binary, e.g. ``"bedGraphToBigWig"`` or ``"bedToBigBed"``.

    Returns
    -------
    str
        ``cmd`` unchanged when it was found on ``PATH``; otherwise the path
        to the binary derived from ``path``.

    Raises
    ------
    ValueError
        If the binary cannot be found on ``PATH`` (``path`` is None), or if
        the resolved path is not an existing, executable file.
    """
    if path is None:
        # Explicit test instead of `assert`: assertions are stripped when
        # Python runs with -O, which would silently skip this validation.
        if shutil.which(cmd) is None:
            # The message is deliberately not tied to one caller, because
            # this helper serves more than one writer function.
            raise ValueError(
                f"{cmd} is not present in the current environment. "
                f"Pass it as 'path_to_binary' parameter or "
                f"install it with, for example, conda install -y -c bioconda "
                f"ucsc-{cmd.lower()} "
            )
        return cmd

    path = os.fspath(path)
    binary = path if path.endswith(cmd) else os.path.join(path, cmd)

    # Both conditions must hold. The previous form
    # `not isfile(p) and access(p, X_OK)` bound `not` to the first operand
    # only, so it was never true for a missing or non-executable file.
    if not (os.path.isfile(binary) and os.access(binary, os.X_OK)):
        raise ValueError(
            f"{cmd} is absent in the provided path or cannot be "
            f"executed: {path}. "
        )
    return binary
515
+
516
+
517
+ def to_bigwig (
518
+ df , chromsizes , outpath , value_field = None , engine = "ucsc" , path_to_binary = None
519
+ ):
520
+ """Save a bedGraph-like dataframe as a binary BigWig file.
493
521
494
522
Parameters
495
523
----------
@@ -507,7 +535,6 @@ def to_bigwig(df, chromsizes, outpath, value_field=None, engine='ucsc', path_to_
507
535
Provide system path to the bedGraphToBigWig binary.
508
536
engine : {'ucsc', 'bigtools'}, optional
509
537
Engine to use for creating the BigWig file.
510
-
511
538
"""
512
539
513
540
is_bedgraph = True
@@ -528,43 +555,21 @@ def to_bigwig(df, chromsizes, outpath, value_field=None, engine='ucsc', path_to_
528
555
bg ["chrom" ] = bg ["chrom" ].astype (str )
529
556
bg = bg .sort_values (["chrom" , "start" , "end" ])
530
557
531
- if chromsizes is None :
532
- chromsizes = df .groupby ('chrom' )['end' ]
533
-
534
- if engine .lower () == 'ucsc' :
535
- if path_to_binary is None :
536
- cmd = "bedGraphToBigWig"
537
- try :
538
- assert shutil .which (cmd ) is not None
539
- except Exception :
540
- raise ValueError (
541
- "bedGraphToBigWig is not present in the current environment. "
542
- "Pass it as 'path_to_binary' parameter to bioframe.to_bigwig or "
543
- "install it with, for example, conda install -y -c bioconda "
544
- "ucsc-bedgraphtobigwig "
545
- ) from None
546
- elif path_to_binary .endswith ("bedGraphToBigWig" ):
547
- if not os .path .isfile (path_to_binary ) and os .access (path_to_binary , os .X_OK ):
548
- raise ValueError (
549
- f"bedGraphToBigWig is absent in the provided path or cannot be "
550
- f"fexecuted: { path_to_binary } . "
551
- )
552
- cmd = path_to_binary
553
- else :
554
- cmd = os .path .join (path_to_binary , "bedGraphToBigWig" )
555
- if not os .path .isfile (cmd ) and os .access (cmd , os .X_OK ):
556
- raise ValueError (
557
- f"bedGraphToBigWig is absent in the provided path or cannot be "
558
- f"executed: { path_to_binary } . "
559
- )
558
+ if engine .lower () == "ucsc" :
559
+ cmd = _find_ucsc_binary (path_to_binary , "bedGraphToBigWig" )
560
560
561
561
with tempfile .NamedTemporaryFile (suffix = ".bg" ) as f , \
562
- tempfile .NamedTemporaryFile ("wt" , suffix = ".chrom.sizes" ) as cs : # fmt: skip
563
- chromsizes .to_csv (cs , sep = "\t " , header = False )
562
+ tempfile .NamedTemporaryFile ("wt" , suffix = ".chrom.sizes" ) as cs : # fmt: skip # noqa: E501
563
+ pd . Series ( chromsizes ) .to_csv (cs , sep = "\t " , header = False )
564
564
cs .flush ()
565
565
566
566
bg .to_csv (
567
- f .name , sep = "\t " , columns = columns , index = False , header = False , na_rep = "nan"
567
+ f .name ,
568
+ sep = "\t " ,
569
+ columns = columns ,
570
+ index = False ,
571
+ header = False ,
572
+ na_rep = "nan" ,
568
573
)
569
574
570
575
p = subprocess .run (
@@ -573,21 +578,27 @@ def to_bigwig(df, chromsizes, outpath, value_field=None, engine='ucsc', path_to_
573
578
)
574
579
return p
575
580
576
- elif engine .lower () == 'bigtools' :
577
- import pybigtools
581
+ elif engine .lower () == "bigtools" :
582
+ try :
583
+ import pybigtools
584
+ except ImportError :
585
+ raise ImportError (
586
+ "pybigtools is required to use engine='bigtools'"
587
+ ) from None
578
588
579
589
f = pybigtools .open (outpath , "w" )
580
590
if issubclass (type (chromsizes ), pd .Series ):
581
591
chromsizes = chromsizes .astype (int ).to_dict ()
582
592
583
- bg = bg .astype ({' chrom' : str , "start" : int , "end" : int , value_field : float })
593
+ bg = bg .astype ({" chrom" : str , "start" : int , "end" : int , value_field : float })
584
594
f .write (chroms = chromsizes , vals = bg .itertuples (index = False ))
585
595
f .close ()
586
596
587
597
588
- def to_bigbed (df , chromsizes , outpath , schema = "bed6" , path_to_binary = None ):
589
- """
590
- Save a bedGraph-like dataframe as a binary BigWig track.
598
+ def to_bigbed (
599
+ df , chromsizes , outpath , schema = "infer" , engine = "ucsc" , path_to_binary = None
600
+ ):
601
+ """Save a BED-like dataframe as a binary BigBed file.
591
602
592
603
Parameters
593
604
----------
@@ -602,63 +613,59 @@ def to_bigbed(df, chromsizes, outpath, schema="bed6", path_to_binary=None):
602
613
Select the column label of the data frame to generate the track. Default
603
614
is to use the fourth column.
604
615
path_to_binary : str, optional
605
- Provide system path to the bedGraphToBigWig binary.
606
-
616
+ Provide system path to the bedToBigBed binary.
607
617
"""
618
+ from bioframe .io .bed import infer_bed_schema , parse_bed_schema , to_bed_dataframe
608
619
609
- if path_to_binary is None :
610
- cmd = "bedToBigBed"
611
- try :
612
- assert shutil .which (cmd ) is not None
613
- except Exception :
614
- raise ValueError (
615
- "bedToBigBed is not present in the current environment. "
616
- "Pass it as 'path_to_binary' parameter to bioframe.to_bigbed or "
617
- "install it with, for example, conda install -y -c bioconda "
618
- "ucsc-bedtobigbed "
619
- ) from None
620
- elif path_to_binary .endswith ("bedToBigBed" ):
621
- if not os .path .isfile (path_to_binary ) and os .access (path_to_binary , os .X_OK ):
622
- raise ValueError (
623
- f"bedToBigBed is absent in the provided path or cannot be "
624
- f"executed: { path_to_binary } . "
625
- )
626
- cmd = path_to_binary
620
+ if schema == "infer" :
621
+ n , _ = infer_bed_schema (df )
627
622
else :
628
- cmd = os .path .join (path_to_binary , "bedGraphToBigWig" )
629
- if not os .path .isfile (cmd ) and os .access (cmd , os .X_OK ):
630
- raise ValueError (
631
- f"bedToBigBed is absent in the provided path or cannot be "
632
- f"executed: { path_to_binary } . "
623
+ n , _ = parse_bed_schema (schema )
624
+
625
+ bed = to_bed_dataframe (df , schema = schema )
626
+ m = len (bed .columns ) - n
627
+ schema = f"bed{ n } +{ m } " if m > 0 else f"bed{ n } "
628
+
629
+ if engine .lower () == "ucsc" :
630
+ if path_to_binary is None :
631
+ cmd = _find_ucsc_binary (path_to_binary , "bedToBigBed" )
632
+
633
+ with tempfile .NamedTemporaryFile (suffix = ".bed" ) as f , \
634
+ tempfile .NamedTemporaryFile ("wt" , suffix = ".chrom.sizes" ) as cs : # fmt: skip # noqa: E501
635
+ pd .Series (chromsizes ).to_csv (cs , sep = "\t " , header = False )
636
+ cs .flush ()
637
+
638
+ bed .to_csv (
639
+ f .name ,
640
+ sep = "\t " ,
641
+ columns = bed .columns ,
642
+ index = False ,
643
+ header = False ,
644
+ na_rep = "nan" ,
633
645
)
634
646
635
- is_bed6 = True
636
- for col in ["chrom" , "start" , "end" , "name" , "score" , "strand" ]:
637
- if col not in df .columns :
638
- is_bed6 = False
639
- if len (df .columns ) < 6 :
640
- is_bed6 = False
641
-
642
- if not is_bed6 :
643
- raise ValueError (f"A bed6-like DataFrame is required, got { df .columns } " )
644
-
645
- columns = ["chrom" , "start" , "end" , "name" , "score" , "strand" ]
646
- bed = df [columns ].copy ()
647
- bed ["chrom" ] = bed ["chrom" ].astype (str )
648
- bed = bed .sort_values (["chrom" , "start" , "end" ])
649
-
650
- with tempfile .NamedTemporaryFile (suffix = ".bed" ) as f , tempfile .NamedTemporaryFile (
651
- "wt" , suffix = ".chrom.sizes"
652
- ) as cs :
653
- chromsizes .to_csv (cs , sep = "\t " , header = False )
654
- cs .flush ()
655
-
656
- bed .to_csv (
657
- f .name , sep = "\t " , columns = columns , index = False , header = False , na_rep = "nan"
658
- )
647
+ p = subprocess .run (
648
+ [cmd , f"-type={ schema } " , f .name , cs .name , outpath ],
649
+ capture_output = True ,
650
+ )
651
+ return p
659
652
660
- p = subprocess .run (
661
- [cmd , f"-type={ schema } " , f .name , cs .name , outpath ],
662
- capture_output = True ,
653
+ elif engine .lower () == "bigtools" :
654
+ try :
655
+ import pybigtools
656
+ except ImportError :
657
+ raise ImportError (
658
+ "pybigtools is required to use engine='bigtools'"
659
+ ) from None
660
+
661
+ f = pybigtools .open (outpath , "w" )
662
+ if issubclass (type (chromsizes ), pd .Series ):
663
+ chromsizes = chromsizes .astype (int ).to_dict ()
664
+
665
+ bed = bed .astype ({"chrom" : str , "start" : int , "end" : int })
666
+ record_iter = (
667
+ (row [0 ], row [1 ], row [2 ], "\t " .join (str (x ) for x in row [3 :]))
668
+ for row in bed .itertuples (index = False )
663
669
)
664
- return p
670
+ f .write (chroms = chromsizes , vals = record_iter )
671
+ f .close ()
0 commit comments