diff --git a/ldsc_polyfun/jackknife.py b/ldsc_polyfun/jackknife.py index ef1306c..1671d3f 100644 --- a/ldsc_polyfun/jackknife.py +++ b/ldsc_polyfun/jackknife.py @@ -573,7 +573,7 @@ class Jackknife_Ridge(Jackknife): def __init__(self, x, y, n_blocks=None, separators=None, chr_num=None, verbose=True, num_lambdas=100, approx_ridge=False, ridge_lambda=None, use_1se=False, has_intercept=False, standardize=True, - skip_ridge_jackknife=True, num_chr_sets=2): + skip_ridge_jackknife=True, num_chr_sets=2, num_chr=22): #sanity checks assert chr_num is not None @@ -586,6 +586,7 @@ def __init__(self, x, y, n_blocks=None, separators=None, chr_num=None, verbose=T self.use_1se = use_1se self.verbose=verbose self.has_intercept = has_intercept + self.num_chr = num_chr ###define chromosome sets assert num_chr_sets>1 @@ -596,8 +597,8 @@ def __init__(self, x, y, n_blocks=None, separators=None, chr_num=None, verbose=T self.chromosome_sets = [] self.chromosome_sets.append(chromosomes[chromosomes%2==0]) self.chromosome_sets.append(chromosomes[chromosomes%2!=0]) - elif num_chr_sets == 22: - self.chromosome_sets = [np.array([c]) for c in range(1,23)] + elif num_chr_sets == self.num_chr: + self.chromosome_sets = [np.array([c]) for c in range(1, self.num_chr+1)] else: chr_sizes = np.bincount(chr_num)[1:] assert num_chr_sets<=len(chr_sizes) @@ -705,7 +706,7 @@ def __init__(self, x, y, n_blocks=None, separators=None, chr_num=None, verbose=T def _divide_chromosomes_to_sets(self, chr_sizes, num_sets): chr_order = np.argsort(chr_sizes)[::-1] #np.arange(len(chr_sizes)) - chr_assignments = np.zeros(22, dtype=np.int64) - 1 + chr_assignments = np.zeros(self.num_chr, dtype=np.int64) - 1 chr_assignments[chr_order[:num_sets]] = np.arange(num_sets) set_sizes = chr_sizes[chr_order[:num_sets]].copy() for c_i in chr_order[num_sets : len(chr_sizes)]: diff --git a/ldsc_polyfun/regressions.py b/ldsc_polyfun/regressions.py index 6541c6b..b96f13a 100644 --- a/ldsc_polyfun/regressions.py +++ b/ldsc_polyfun/regressions.py @@ -152,7 +152,8 @@ def __init__(self, y, x, w, N, M, n_blocks, intercept=None, slow=False, step1_ii skip_ridge_jackknife=True, num_chr_sets=2, evenodd_split=False, - nnls_exact=False + nnls_exact=False, + num_chr=22 ): for i in [y, x, w, M, N]: @@ -254,7 +255,7 @@ def __init__(self, y, x, w, N, M, n_blocks, intercept=None, slow=False, step1_ii chr_num=chr_num, verbose=verbose, ridge_lambda=ridge_lambda, has_intercept=(not self.constrain_intercept), standardize=standardize_ridge, approx_ridge=approx_ridge, - skip_ridge_jackknife=skip_ridge_jackknife, num_chr_sets=num_chr_sets) + skip_ridge_jackknife=skip_ridge_jackknife, num_chr_sets=num_chr_sets, num_chr=num_chr) else: assert not loco @@ -393,7 +394,7 @@ def __init__(self, y, x, w, N, M, n_blocks=200, intercept=None, slow=False, twostep=None, old_weights=False, chr_num=None, verbose=True, approx_ridge=False, loco=False, ridge_lambda=None, standardize_ridge=True, skip_ridge_jackknife=True, keep_large=False, - num_chr_sets=22, evenodd_split=False, nn=False, nnls_exact=False): + num_chr_sets=22, evenodd_split=False, nn=False, nnls_exact=False, num_chr=22): step1_ii = None if twostep is not None: step1_ii = y < twostep @@ -411,7 +412,8 @@ def __init__(self, y, x, w, N, M, n_blocks=200, intercept=None, slow=False, evenodd_split=evenodd_split, nn=nn, keep_large=keep_large, - nnls_exact=nnls_exact + nnls_exact=nnls_exact, + num_chr=num_chr ) self.mean_chisq, self.lambda_gc = self._summarize_chisq(y) if not self.constrain_intercept: diff --git a/ldsc_polyfun/sumstats.py b/ldsc_polyfun/sumstats.py index 7c3939b..f6dc258 100644 --- a/ldsc_polyfun/sumstats.py +++ b/ldsc_polyfun/sumstats.py @@ -149,13 +149,13 @@ def _read_w_ld(args, log): def _read_chr_split_files(chr_arg, not_chr_arg, log, noun, parsefunc, **kwargs): - '''Read files split across 22 chromosomes (annot, ref_ld, w_ld).''' + '''Read files split across chromosomes (annot, ref_ld, w_ld).''' try: if not_chr_arg: log.log('Reading {N} from {F} ...'.format(F=not_chr_arg, N=noun)) out = parsefunc(_splitp(not_chr_arg), **kwargs) elif chr_arg: - f = ps.sub_chr(chr_arg, '[1-22]') + f = ps.sub_chr(chr_arg, '[1-%d]'%(_N_CHR)) log.log('Reading {N} from {F} ...'.format(F=f, N=noun)) out = parsefunc(_splitp(chr_arg), _N_CHR, **kwargs) except ValueError as e: @@ -282,6 +282,7 @@ def _read_ld_sumstats(args, log, fh, alleles=True, dropna=True): def estimate_h2(args, log): '''Estimate h2 and partitioned h2.''' args = copy.deepcopy(args) + _N_CHR = args.num_chr if args.samp_prev is not None and args.pop_prev is not None: args.samp_prev, args.pop_prev = list(map( float, [args.samp_prev, args.pop_prev])) @@ -334,7 +335,8 @@ def estimate_h2(args, log): evenodd_split=args.evenodd_split, nn=args.nn, keep_large=args.keep_large, - nnls_exact=args.nnls_exact + nnls_exact=args.nnls_exact, + num_chr=args.num_chr ) if args.print_cov: diff --git a/polyfun.py b/polyfun.py index efe84c4..e17f432 100644 --- a/polyfun.py +++ b/polyfun.py @@ -124,13 +124,13 @@ def check_files(args): if args.compute_h2_L2: if not os.path.exists(args.sumstats): raise IOError('Cannot find sumstats file %s'%(args.sumstats)) - for chr_num in range(1,23): + for chr_num in range(1, args.num_chr+1): get_file_name(args, 'ref-ld', chr_num, verify_exists=True, allow_multiple=True) get_file_name(args, 'w-ld', chr_num, verify_exists=True) get_file_name(args, 'annot', chr_num, verify_exists=True, allow_multiple=True) if args.compute_ldscores: - if args.chr is None: chr_range = range(1,23) + if args.chr is None: chr_range = range(1, args.num_chr+1) else: chr_range = range(args.chr, args.chr+1) for chr_num in chr_range: @@ -143,7 +143,7 @@ def check_files(args): get_file_name(args, 'bins', chr_num, verify_exists=True) if args.compute_h2_bins and not args.compute_ldscores: - for chr_num in range(1,23): + for chr_num in range(1, args.num_chr+1): get_file_name(args, 'w-ld', chr_num, verify_exists=True) if not args.compute_h2_L2: get_file_name(args, 'bins', chr_num, verify_exists=True) @@ -228,7 +228,8 @@ def run_ldsc(self, args, use_ridge, nn, keep_large, evenodd_split, n_blocks=2): evenodd_split=evenodd_split, nn=nn, keep_large=keep_large, - nnls_exact=args.nnls_exact + nnls_exact=args.nnls_exact, + num_chr=args.num_chr ) #save the results object @@ -309,7 +310,7 @@ def compute_snpvar_chr(self, args, chr_num, use_ridge): #make sure that the chromosome exists in one set found_chrom = np.any([chr_num in chr_set for chr_set in jknife.chromosome_sets]) if not found_chrom: - raise ValueError('not all chromosomes have a taus estimate - please make sure that the intersection of SNPs with sumstats and with annotations data spans all 22 human chromosomes') + raise ValueError('not all chromosomes have a taus estimate - please make sure that the intersection of SNPs with sumstats and with annotations data spans all chromosomes') #find the relevant set number set_num=None @@ -326,8 +327,8 @@ def compute_snpvar_chr(self, args, chr_num, use_ridge): else: hsqhat = self.hsqhat jknife = hsqhat.jknife - if len(jknife.est_loco) != 22: - raise ValueError('not all chromosomes have a taus estimate - please make sure that the intersection of SNPs with sumstats and with annotations data spans all 22 human chromosomes') + if len(jknife.est_loco) != args.num_chr: + raise ValueError('not all chromosomes have a taus estimate - please make sure that the intersection of SNPs with sumstats and with annotations data spans all chromosomes') taus = jknife.est_loco[chr_num-1][:hsqhat.n_annot] / hsqhat.Nbar #save the taus to disk @@ -349,7 +350,7 @@ def compute_snpvar(self, args, use_ridge): #iterate over chromosomes df_snpvar_chr_list = [] - for chr_num in tqdm(range(1,23)): + for chr_num in tqdm(range(1, args.num_chr+1)): df_snpvar_chr = self.compute_snpvar_chr(args, chr_num, use_ridge=use_ridge) df_snpvar_chr_list.append(df_snpvar_chr) df_snpvar = pd.concat(df_snpvar_chr_list, axis=0) @@ -519,7 +520,7 @@ def partition_snps_to_bins(self, args, use_ridge): def save_bins_to_disk(self, args): logging.info('Saving SNP-bins to disk') - for chr_num in tqdm(range(1,23)): + for chr_num in tqdm(range(1, args.num_chr+1)): #save bins file to disk df_bins_chr = self.df_bins.query('CHR==%d'%(chr_num)) @@ -576,7 +577,7 @@ def save_snpvar_to_disk(self, args, use_ridge, constrain_range): raise ValueError(error_message + '. If you wish to omit the missing SNPs, please use the flag --allow-missing') #iterate over chromosomes - for chr_num in tqdm(range(1,23)): + for chr_num in tqdm(range(1, args.num_chr+1)): #define output file name output_fname = 'snpvar' @@ -616,7 +617,7 @@ def load_bins_chr(self, args, chr_num): def compute_ld_scores(self, args): #define the range of chromosomes to iterate over if args.chr is None: - chr_range = range(1,23) + chr_range = range(1, args.num_chr+1) else: chr_range = range(args.chr, args.chr+1) @@ -818,6 +819,7 @@ def polyfun_main(self, args): parser.add_argument('--ld-dir', default=None, help='The path of a directory with UKB LD files (if not specified PolyFun will create a temporary directory)') parser.add_argument('--output-prefix', required=True, help='Prefix of all PolyFun output file names') parser.add_argument('--allow-missing', default=False, action='store_true', help='If specified, PolyFun will not terminate if some SNPs with sumstats are not found in the annotations files') + parser.add_argument('--num-chr', type=int, default=22, help='Number of chromosomes for your target organism (default is 22)') #LDSC parameters parser.add_argument('--nnls-exact', default=False, action='store_true', help='If specified, S-LDSC will estimate non-negative taus using an exact instead of an approximate solver (this will be slower but slightly more accurate)') diff --git a/polyloc.py b/polyloc.py index 747fc58..25df667 100644 --- a/polyloc.py +++ b/polyloc.py @@ -93,7 +93,7 @@ def check_files(args): #check that required input files exist if args.compute_ldscores or args.compute_partitions: - if args.chr is None: chr_range = range(1,23) + if args.chr is None: chr_range = range(1, args.num_chr+1) else: chr_range = range(args.chr, args.chr+1) for chr_num in chr_range: @@ -108,7 +108,7 @@ def check_files(args): get_file_name(args, 'bins', chr_num, verify_exists=True) if args.compute_polyloc: - for chr_num in range(1,23): + for chr_num in range(1, args.num_chr+1): get_file_name(args, 'w-ld', chr_num, verify_exists=True) if not args.compute_partitions: get_file_name(args, 'bins', chr_num, verify_exists=True) @@ -152,7 +152,7 @@ def polyloc_partitions(self, args): #add another partition for all SNPs not in the posterior file if args.bfile_chr is not None: df_bim_list = [] - for chr_num in range(1,23): + for chr_num in range(1, args.num_chr+1): df_bim_chr = pd.read_table(args.bfile_chr+'%d.bim'%(chr_num), sep='\s+', names=['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], header=None) df_bim_list.append(df_bim_chr) df_bim = pd.concat(df_bim_list, axis=0) @@ -328,6 +328,7 @@ def polyloc_main(self, args): parser.add_argument('--output-prefix', required=True, help='Prefix of all PolyLoc out file names') parser.add_argument('--ld-ukb', default=False, action='store_true', help='If specified, PolyLoc will use UKB LD matrices to compute LD-scores') parser.add_argument('--ld-dir', default=None, help='The path of a directory with UKB LD files (if not specified PolyLoc will create a temporary directory)') + parser.add_argument('--num-chr', type=int, default=22, help='Number of chromosomes for your target organism (default is 22)')