forked from easybuilders/easybuild-easyconfigs
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
alberta
committed
Apr 17, 2024
1 parent
fa12142
commit 22e1630
Showing
6 changed files
with
339 additions
and
14 deletions.
There are no files selected for viewing
167 changes: 167 additions & 0 deletions
167
easybuild/easyconfigs/h/HapCUT2/HAPcut-v1.3.3-Enable_MI_Tag.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
From 0a1706ee3a4ca170e51d426595002b7994e96751 Mon Sep 17 00:00:00 2001 | ||
From: Pontus Hojer <pontushojer@gmail.com> | ||
Date: Tue, 8 Jun 2021 10:47:48 +0200 | ||
Subject: [PATCH] Enable the use of the MI tag to detect molecules | ||
|
||
--- | ||
utilities/LinkFragments.py | 105 ++++++++++++++++++++++++++++++++++--- | ||
1 file changed, 99 insertions(+), 6 deletions(-) | ||
|
||
diff --git a/utilities/LinkFragments.py b/utilities/LinkFragments.py | ||
index e425d8f..ad8279b 100644 | ||
--- a/utilities/LinkFragments.py | ||
+++ b/utilities/LinkFragments.py | ||
@@ -13,10 +13,10 @@ | ||
import pysam | ||
import argparse | ||
import os | ||
+from math import inf | ||
|
||
barcode_tag = 'BX' | ||
|
||
- | ||
############################################################################### | ||
############################################################################### | ||
# these functions pulled from "getMolecules" script: | ||
@@ -121,7 +121,93 @@ def get_gemcode_regions(ibam, dist): | ||
max([end for chr, pos, end in gemcodes[barcode]]), | ||
barcode, len(gemcodes[barcode])) | ||
|
||
-def get_molecules(bam,ref=None,dist=20000): | ||
+ | ||
+def get_gemcode_regions_from_tag(ibam): | ||
+ """ | ||
+ Get molecule coordinates from reads that share BX and MI tags. | ||
+ | ||
+ Parameters | ||
+ ---------- | ||
+ ibam : pysam.AlignmentFile | ||
+ Input 10X bam with set MI and BX tags | ||
+ | ||
+ Yields | ||
+ ------ | ||
+ region : namedtuple | ||
+ chr, start, end, barcode, readcount | ||
+ """ | ||
+ # Create defaultdict for storing molecules | ||
+ # Key is gemcodes + molecule id, value is Molecules instance | ||
+ molecules = defaultdict(Molecule) | ||
+ | ||
+ # Set current_chr as reference contig of first read | ||
+ current_chr = None | ||
+ | ||
+ # Iterate over reads in bamfile | ||
+ for read in ibam: | ||
+ | ||
+ if (read.mapq < min_mapq or read.is_unmapped or read.is_duplicate or read.is_secondary or | ||
+ read.is_qcfail): | ||
+ continue | ||
+ | ||
+ assert(read.reference_name is not None) | ||
+ assert(read.reference_start is not None) | ||
+ assert(read.reference_end is not None) | ||
+ | ||
+ if not read.has_tag('BX') or not read.has_tag('MI'): | ||
+ continue | ||
+ | ||
+ gem = read.get_tag('BX') | ||
+ molecule_id = read.get_tag('MI') | ||
+ | ||
+ # Combined gem/barcode and molcule id for storing molecules | ||
+ gem_molecule = (gem, molecule_id) | ||
+ | ||
+ # If the read is from a new contig/chromosome, write out all molecules | ||
+ # in memory | ||
+ if read.reference_name != current_chr and current_chr is not None: | ||
+ for _, molecule in molecules.items(): | ||
+ yield molecule | ||
+ | ||
+ molecules.clear() | ||
+ | ||
+ # Add read to molecules | ||
+ molecules[gem_molecule].add(chr=read.reference_name, | ||
+ start=read.reference_start, | ||
+ end=read.reference_end, | ||
+ barcode=gem) | ||
+ | ||
+ # Save read contig as current_chr | ||
+ current_chr = read.reference_name | ||
+ | ||
+ # Write out all remaining molecules at end of bam | ||
+ for _, molecule in molecules.items(): | ||
+ yield molecule | ||
+ | ||
+ | ||
+class Molecule: | ||
+ def __init__(self): | ||
+ self.chr = None | ||
+ self.start = inf | ||
+ self.end = 0 | ||
+ self.barcode = None | ||
+ self.readcount = 0 | ||
+ | ||
+ def add(self, chr, start, end, barcode): | ||
+ self.set_if_none("chr", chr) | ||
+ self.start = min(self.start, start) | ||
+ self.end = max(self.end, end) | ||
+ self.set_if_none("barcode", barcode) | ||
+ self.readcount += 1 | ||
+ | ||
+ def set_if_none(self, attr, value): | ||
+ if getattr(self, attr) is None: | ||
+ setattr(self, attr, value) | ||
+ else: | ||
+ assert getattr(self, attr) == value | ||
+ | ||
+ | ||
+def get_molecules(bam,ref=None,dist=20000, use_tag=False): | ||
|
||
#Open outfile | ||
bamf = pysam.AlignmentFile(bam,"rb"); | ||
@@ -131,7 +217,12 @@ def get_molecules(bam,ref=None,dist=20000): | ||
ibam = bamf.fetch(reference=ref) | ||
|
||
#Get gemcode regions | ||
- for bed in get_gemcode_regions(ibam, dist): | ||
+ if use_tag: | ||
+ regions = get_gemcode_regions_from_tag(ibam) | ||
+ else: | ||
+ regions = get_gemcode_regions(ibam, dist) | ||
+ | ||
+ for bed in regions: | ||
#Turn molecule object into string | ||
yield (bed.chr, bed.start, bed.end, bed.barcode) | ||
|
||
@@ -290,7 +381,7 @@ def parse_bedfile(input_file): | ||
barcode = el[3] | ||
yield (chrom, start, stop, barcode) | ||
|
||
-def link_fragments(hairs_file, vcf_file, bam_file, outfile, dist, single_SNP_frags,maxbq): | ||
+def link_fragments(hairs_file, vcf_file, bam_file, outfile, dist, single_SNP_frags,maxbq=40, use_tag=False): | ||
|
||
lines = [] | ||
|
||
@@ -338,7 +429,7 @@ def link_fragments(hairs_file, vcf_file, bam_file, outfile, dist, single_SNP_fra | ||
dup_snp_cover = 0 | ||
|
||
# start is 0-indexed start point and stop is 1 past the last residue, 0-indexed | ||
- for chrom,start,stop,barcode in get_molecules(bam_file, curr_chrom, dist=dist): | ||
+ for chrom,start,stop,barcode in get_molecules(bam_file, curr_chrom, dist=dist, use_tag=use_tag): | ||
#print("molecule: {} {} {} {}".format(chrom,start,stop,barcode)) | ||
if chrom != curr_chrom: | ||
continue | ||
@@ -469,6 +560,7 @@ def parseargs(): | ||
parser.add_argument('-o', '--outfile', nargs='?', type = str, help='output file with linked fragments') | ||
parser.add_argument('-d', '--distance', nargs='?', type = int, help='distance in base pairs that delineates separate 10X molecules, default=20kb',default=20000) | ||
parser.add_argument('-m', '--maxbq', nargs='?', type = int, help='maximum base quality for an allele call, default=40',default=40) | ||
+ parser.add_argument('--use-tag', action="store_true", default=False, help=f'use molecule tag (MI) to separate between molecules') | ||
|
||
parser.add_argument('-s', '--single_SNP_frags', action='store_true', help='whether to keep fragments overlapping only one SNP', default=False) | ||
|
||
@@ -484,4 +576,5 @@ def parseargs(): | ||
# parse input and run function to call alleles | ||
if __name__ == '__main__': | ||
args = parseargs() | ||
- link_fragments(args.fragments,args.VCF,args.bam_file, args.outfile, args.distance, args.single_SNP_frags,args.maxbq) | ||
+ link_fragments(args.fragments,args.VCF,args.bam_file, args.outfile, args.distance, args.single_SNP_frags,args.maxbq, | ||
+ args.use_tag) |
23 changes: 23 additions & 0 deletions
23
easybuild/easyconfigs/h/HapCUT2/HAPcut-v1.3.3-Fix-Makefile00.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
From 96f924e18d70add1841fe5b11c60e8d8d2a872b2 Mon Sep 17 00:00:00 2001 | ||
From: snehring <snehring@iastate.edu> | ||
Date: Wed, 27 Apr 2022 11:29:07 -0500 | ||
Subject: [PATCH] Makefile changes | ||
|
||
--- | ||
Makefile | 3 +-- | ||
1 file changed, 1 insertion(+), 2 deletions(-) | ||
|
||
diff --git a/Makefile b/Makefile | ||
index c473df9..b0d754d 100644 | ||
--- a/Makefile | ||
+++ b/Makefile | ||
@@ -2,8 +2,7 @@ | ||
# HAPCUT2 MAKEFILE | ||
default: all | ||
|
||
-CC=gcc -g -O3 -Wall -D_GNU_SOURCE | ||
-CFLAGS=-Wall | ||
+CFLAGS=-Wall -g -O3 -Wall -D_GNU_SOURCE | ||
|
||
# DIRECTORIES | ||
B=build |
File renamed without changes.
103 changes: 103 additions & 0 deletions
103
easybuild/easyconfigs/h/HapCUT2/HAPcut-v1.3.3-Fix_Add_install_script.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
From c8fa7a482f8422aa7e3c03bf5a483bf3f485b7f0 Mon Sep 17 00:00:00 2001 | ||
From: Vikas Bansal <vikas115@gmail.com> | ||
Date: Mon, 31 Jan 2022 13:46:05 -0800 | ||
Subject: [PATCH] - added script install.sh to install htslib (from source) and | ||
hapcut2 automatically | ||
|
||
- path to htslib changed to htslib | ||
--- | ||
Makefile | 4 ++-- | ||
hairs-src/readvariant.c | 1 + | ||
hapcut2-src/hic.c | 6 +++++- | ||
hapcut2-src/optionparser.c | 2 +- | ||
install.sh | 10 ++++++++++ | ||
5 files changed, 19 insertions(+), 4 deletions(-) | ||
create mode 100644 install.sh | ||
|
||
diff --git a/Makefile b/Makefile | ||
index dd185cd..c473df9 100644 | ||
--- a/Makefile | ||
+++ b/Makefile | ||
@@ -9,7 +9,7 @@ CFLAGS=-Wall | ||
B=build | ||
H=hairs-src | ||
X=hapcut2-src | ||
-HTSLIB=/usr/common/src/htslib #path/to/htslib/ | ||
+HTSLIB=htslib #/usr/common/src/htslib #path/to/htslib/ | ||
T=test | ||
# below is the path to CUnit directory, change if need be | ||
CUNIT=/usr/include/CUnit | ||
@@ -19,7 +19,7 @@ all: $(B)/extractHAIRS $(B)/HAPCUT2 | ||
|
||
#temporarily removed -O2 flag after -I$(HTSLIB) | ||
$(B)/extractHAIRS: $(B)/bamread.o $(B)/hashtable.o $(B)/readvariant.o $(B)/readfasta.o $(B)/hapfragments.o $(H)/extracthairs.c $(H)/parsebamread.c $(H)/realignbamread.c $(H)/nw.c $(H)/realign_pairHMM.c $(H)/estimate_hmm_params.c | $(B) | ||
- $(CC) $(CFLAGS) $(LDFLAGS) -I$(HTSLIB) -g $(B)/bamread.o $(B)/hapfragments.o $(B)/hashtable.o $(B)/readfasta.o $(B)/readvariant.o -o $(B)/extractHAIRS $(H)/extracthairs.c -pthread -lhts -lm -lz -lcurl -lcrypto -llzma -lbz2 | ||
+ $(CC) $(CFLAGS) $(LDFLAGS) -I$(HTSLIB) -g $(B)/bamread.o $(B)/hapfragments.o $(B)/hashtable.o $(B)/readfasta.o $(B)/readvariant.o -o $(B)/extractHAIRS $(H)/extracthairs.c -pthread -lhts -lm -lz -lcurl -llzma -lbz2 | ||
#temporarily removed -O2 flag after -I$(HTSLIB) | ||
|
||
$(B)/hapfragments.o: $(H)/hapfragments.c $(H)/hapfragments.h $(H)/readvariant.h | $(B) | ||
diff --git a/hairs-src/readvariant.c b/hairs-src/readvariant.c | ||
index 04fe6ba..0a95d29 100644 | ||
--- a/hairs-src/readvariant.c | ||
+++ b/hairs-src/readvariant.c | ||
@@ -93,6 +93,7 @@ int parse_variant(VARIANT* variant, char* buffer, int samplecol) | ||
// int altalleles = splitString(variant->AA,',',alist); | ||
|
||
int gl = strlen(variant->genotype); | ||
+ //fprintf(stderr,"variant genotype %d %s \n",variant->position,variant->genotype); | ||
// check that the genotype field is diploid | ||
if ((gl >= 3 && (variant->genotype[1] == '/' || variant->genotype[1] == '|')) && | ||
(gl == 3 || variant->genotype[3] == ':') && | ||
diff --git a/hapcut2-src/hic.c b/hapcut2-src/hic.c | ||
index fc85142..2036dde 100644 | ||
--- a/hapcut2-src/hic.c | ||
+++ b/hapcut2-src/hic.c | ||
@@ -3,6 +3,10 @@ | ||
float HIC_EM_THRESHOLD = 0.99; | ||
/* | ||
functions that are specific to processing of Hi-C reads for phasing: estimating h-trans error rates and I/O | ||
+HTRANS_BINSIZE = 5000; | ||
+HTRANS_MAXBINS = 10000; // this value will be overwritten at startup | ||
+HTRANS_READ_LOWBOUND = 500; | ||
+HTRANS_MAX_WINDOW = 4000000; // maximum window size for h-trans estimation | ||
*/ | ||
|
||
int count_htrans_bins(char* htrans_file) { | ||
@@ -169,7 +173,7 @@ int estimate_htrans_probs(struct fragment* Flist, int fragments, char* HAP, stru | ||
} | ||
} | ||
|
||
- // using neighboring bins to calculate mean htrans for each bin | ||
+ // using neighboring bins to calculate mean htrans for each bin if the number of reads in bin is less < 500 (READ_LOWBOUND) | ||
for (i = 0; i < HTRANS_MAXBINS; i++){ | ||
adj_MLE_count[i] = MLE_count[i]; | ||
adj_MLE_sum[i] = MLE_sum[i]; | ||
diff --git a/hapcut2-src/optionparser.c b/hapcut2-src/optionparser.c | ||
index 8308ec3..6674956 100644 | ||
--- a/hapcut2-src/optionparser.c | ||
+++ b/hapcut2-src/optionparser.c | ||
@@ -200,7 +200,7 @@ void print_hapcut_options() { | ||
//fprintf(stderr, "--split_threshold, --st <float>: PHRED SCALED threshold for splitting blocks (range 0-100, larger values split more). default: 0\n"); | ||
fprintf(stderr, "--call_homozygous, --ch <0/1>: call positions as homozygous if they appear to be false heterozygotes. default: 0\n"); | ||
fprintf(stderr, "--discrete_pruning, --dp <0/1>: use discrete heuristic to prune SNPs. default: 0\n"); | ||
- fprintf(stderr, "--error_analysis_mode, --ea <0/1>: compute switch confidence scores and print to haplotype file but don't split blocks or prune. default: 0\n"); | ||
+ //fprintf(stderr, "--error_analysis_mode, --ea <0/1>: compute switch confidence scores and print to haplotype file but don't split blocks or prune. default: 0\n"); | ||
|
||
fprintf(stderr, "\nAdvanced Options:\n"); | ||
fprintf(stderr, "--new_format, --nf <0/1>: use new Hi-C fragment matrix file format (but don't do h-trans error modeling). default: 0\n"); | ||
diff --git a/install.sh b/install.sh | ||
new file mode 100644 | ||
index 0000000..629ebad | ||
--- /dev/null | ||
+++ b/install.sh | ||
@@ -0,0 +1,10 @@ | ||
+git clone https://github.com/vibansal/HapCUT2.git | ||
+cd hapcut2 | ||
+git clone https://github.com/samtools/htslib.git | ||
+cd htslib | ||
+autoreconf -i | ||
+./configure | ||
+make | ||
+cd .. | ||
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$(pwd)/htslib" | ||
+make |
36 changes: 36 additions & 0 deletions
36
easybuild/easyconfigs/h/HapCUT2/HAPcut-v1.3.3-Fix_build_errors.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
From 65771565f29aa61f1fa7e8319a39c8a65cc19824 Mon Sep 17 00:00:00 2001 | ||
From: snehring <snehring@iastate.edu> | ||
Date: Wed, 27 Apr 2022 11:28:35 -0500 | ||
Subject: [PATCH] fixing build errors with newer gcc versions | ||
|
||
--- | ||
hairs-src/hashtable.c | 2 +- | ||
hairs-src/hashtable.h | 2 +- | ||
2 files changed, 2 insertions(+), 2 deletions(-) | ||
|
||
diff --git a/hairs-src/hashtable.c b/hairs-src/hashtable.c | ||
index d221547..b4a1c58 100644 | ||
--- a/hairs-src/hashtable.c | ||
+++ b/hairs-src/hashtable.c | ||
@@ -39,7 +39,7 @@ int insert_keyvalue(HASHTABLE* ht, char* key, int slen, int value) { | ||
|
||
int getindex(HASHTABLE* ht, char* chrom) { | ||
int hash = hashstring(chrom, ht->htsize); | ||
- keypointer = ht->blist[hash]; | ||
+ keyvalue* keypointer = ht->blist[hash]; | ||
while (keypointer != NULL) { | ||
if (strcmp(keypointer->key, chrom) == 0) return keypointer->value; | ||
keypointer = keypointer->next; | ||
diff --git a/hairs-src/hashtable.h b/hairs-src/hashtable.h | ||
index 1cb566b..3153da2 100644 | ||
--- a/hairs-src/hashtable.h | ||
+++ b/hairs-src/hashtable.h | ||
@@ -13,7 +13,7 @@ typedef struct keyvalue { | ||
struct keyvalue* next; | ||
} keyvalue; | ||
|
||
-keyvalue* keypointer; | ||
+extern keyvalue* keypointer; | ||
|
||
typedef struct HASHTABLE { | ||
int htsize; // prime number that is also the size of HASHTABLE |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters