diff --git a/COPYRIGHT b/COPYRIGHT index 9fe6926..500f8ee 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -1 +1 @@ -Copyright 2016, 2017 Lingfei Wang +Copyright 2016-2018 Lingfei Wang diff --git a/Makefile b/Makefile index fea521b..8d26b02 100644 --- a/Makefile +++ b/Makefile @@ -15,11 +15,10 @@ URL_BIN_REL="$(URL_BIN)/releases" URL_R_REL="$(URL_R)/releases" VERSION1=1 VERSION2=0 -VERSION3=5 +VERSION3=6 LICENSE=AGPL-3 LICENSE_FULL="GNU Affero General Public License, Version 3" LICENSE_URL="https://www.gnu.org/licenses/agpl-3.0" -#UNAME=$$(uname) ifdef INCLUDE_MAKEFILE_BEFORE #Input package info here include $(INCLUDE_MAKEFILE_BEFORE) @@ -39,7 +38,8 @@ DIR_SRC=. endif DIR_INSTALL_PREFIX=$(PREFIX) DIR_INSTALL_LIB=$(DIR_INSTALL_PREFIX)/lib -DIR_INSTALL_INC=$(DIR_INSTALL_PREFIX)/include/$(LIB_NAME) +DIR_INSTALL_INC0=$(DIR_INSTALL_PREFIX)/include +DIR_INSTALL_INC=$(DIR_INSTALL_INC0)/$(LIB_NAME) CC=gcc CFLAGSI=$(addprefix -I ,. $(R_INCLUDE_DIR) $(PREFIX)/include /usr/local/include) @@ -66,10 +66,24 @@ INC_INSTALL_FILES=$(LIB_H) INC_INSTALL_DIRS=$(dir $(LIB_H)) LIB_UNINSTALL=$(addprefix $(DIR_INSTALL_LIB)/,$(notdir $(LIB_DPRODUCT))) INC_UNINSTALL=$(DIR_INSTALL_INC) +PKGCONFIG=$(LIB_NAME).pc +PKGCONFIG_UNINSTALL=$(DIR_INSTALL_LIB)/pkgconfig/$(LIB_NAME).pc .PHONY: all clean distclean install-lib install-inc install uninstall -all: $(LIB_DPRODUCT) +all: $(LIB_DPRODUCT) $(PKGCONFIG) + +$(PKGCONFIG): + @echo "prefix=$(DIR_INSTALL_PREFIX)" > $@ + @echo "exec_prefix=$(DIR_INSTALL_PREFIX)" >> $@ + @echo "libdir=$(DIR_INSTALL_LIB)" >> $@ + @echo "includedir=$(DIR_INSTALL_INC0)" >> $@ + @echo >> $@ + @echo "Name: $(LIB_NAME)" >> $@ + @echo "Description: Fast Inference of Networks from Directed Regulations" >> $@ + @echo "Version: $(VERSION1).$(VERSION2).$(VERSION3)" >> $@ + @echo "Libs: -L$(DIR_INSTALL_LIB) -l$(LIB_NAME) -lgsl" >> $@ + @echo "Cflags: -I$(DIR_INSTALL_INC0)" >> $@ $(LIB_CONFIG): @echo "#ifndef _HEADER_LIB_CONFIG_AUTO_H_" > $@ @@ -100,23 +114,14 @@ clean: $(RM) $(LIB_PRODUCT) distclean: clean - $(RM) $(LIB_DPRODUCT) - $(RM) $(LIB_CONFIG) - $(RM) Makefile.flags $(TMP_FILE) + $(RM) $(LIB_DPRODUCT) $(PKGCONFIG) $(LIB_CONFIG) Makefile.flags $(TMP_FILE) install-lib: SHELL:=/bin/bash install-lib: all umask 0022 && mkdir -p $(DIR_INSTALL_LIB) && \ cp $(LIB_DPRODUCT) $(DIR_INSTALL_LIB)/ && \ chmod 0755 $(DIR_INSTALL_LIB)/$(notdir $(LIB_DPRODUCT)) && \ - if [ "$$(uname)" == "Linux" ]; then \ - ldconfig $(DIR_INSTALL_LIB) || true; \ - fi; \ - tcyg=$$(uname | grep -io "cygwin"); \ - if [ -n "$$tcyg" ]; then \ - ln -s $(DIR_INSTALL_LIB)/$(notdir $(LIB_DPRODUCT)) $(basename $(DIR_INSTALL_LIB)/$(notdir $(LIB_DPRODUCT))).dll; \ - fi - + ldconfig $(DIR_INSTALL_LIB) || true install-inc: SHELL:=/bin/bash install-inc: $(LIB_CONFIG) @@ -130,10 +135,15 @@ install-inc: $(LIB_CONFIG) chmod 0644 $(DIR_INSTALL_INC)/$$fname || exit 1; \ done -install: install-lib install-inc +install-pkgconfig: $(PKGCONFIG) + umask 0022 && mkdir -p $(DIR_INSTALL_LIB)/pkgconfig && \ + cp $< $(DIR_INSTALL_LIB)/pkgconfig/ + chmod 0644 $(DIR_INSTALL_LIB)/pkgconfig/$(notdir $<) + +install: install-lib install-inc install-pkgconfig uninstall: - $(RM) -R $(LIB_UNINSTALL) $(INC_UNINSTALL) + $(RM) -R $(LIB_UNINSTALL) $(INC_UNINSTALL) $(PKGCONFIG_UNINSTALL) TMP_FILE=.tmp Makefile.flags: diff --git a/UPDATES b/UPDATES index 74bb24d..d6dd91f 100644 --- a/UPDATES +++ b/UPDATES @@ -1,8 +1,15 @@ +1.0.6: + Corrected a bug that may produce biased output in pij_gassist, pij_gassist_trad, and pijs_gassist when nodiag is set or when memlimit is small so computation is split into chunks. + Now setting histogram bounds based on the maximum of all LLRs (as opposed to the maximum of the chunk when memlimit is small) in pij_gassist, pij_gassist_trad, and pijs_gassist. This ensures the output is independent of memlimit (related to question from sritchie73@github). + Added sanity checks for agreement between input data and nodiag flag for pij functions excluding _pv (suggested by sritchie73@github). + Lots of internal function renaming and code restructuring. + Removed some unneeded files and functions. + Now support pkg-config setup. 1.0.5: - Updated Makefiles to account for different make environments. + Updated Makefiles to account for different make environments (reported by sritchie73@github). 1.0.1: Bug correction: - Updated LDFLAGS for R interface. + Updated LDFLAGS for R interface (reported by audreyqyfu@github). 1.0.0: New functions: Included P-value computation for 4 tests in pijs_gassist_pv and correlation test in pij_rank_pv. diff --git a/base/config.h b/base/config.h index 0f7f6ca..5b5781f 100644 --- a/base/config.h +++ b/base/config.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/const.h b/base/const.h index c251a70..1fea54a 100644 --- a/base/const.h +++ b/base/const.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/data_process.c b/base/data_process.c index f085e35..a8f6027 100644 --- a/base/data_process.c +++ b/base/data_process.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -18,6 +18,7 @@ #include "config.h" #include #include +#include #include #include #include "types.h" @@ -435,7 +436,72 @@ void MATRIXFF(minmax_nodiag)(const MATRIXF* d,FTYPE* restrict dmin,FTYPE* restri } } +/* Quick hash of VECTORF using bitwise operation + * Return: hash of VECTORF as FTYPE + */ +static inline FTYPE VECTORFF(hash)(const VECTORF* v) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#define TFUTYPE CONCATENATE3(uint,FTYPEBITS,_t) + TFUTYPE h; + size_t i; + + h=*(const TFUTYPE*)VECTORFF(const_ptr)(v,0); + for(i=1;isize;i++) + { + h=(h<<1)|(h>>(FTYPEBITS-1)); + h^=*(const TFUTYPE*)VECTORFF(const_ptr)(v,i); + } + return *(FTYPE*)(&h); +} + +int MATRIXFF(cmprow)(const MATRIXF* m1,const MATRIXF* m2,VECTORF* buff1,VECTORF* buff2,char nodiag,char warn) +{ + size_t i,j; + size_t ng,nt; + union u + { + FTYPE f; + TFUTYPE u; + } t; + FTYPE fmin,fmax; + int test; + ng=m1->size1; + nt=m2->size1; + assert((m2->size2==m1->size2)&&(buff->size1==ng)&&(buff1->size==ng)&&(buff2->size==m1->size2)); + + for(i=0;isize;i++) + if(VECTORFF(get)(v,i)==vold) + VECTORFF(set)(v,i,vnew); +} static inline void MATRIXFF(cov1)(const MATRIXF* m,MATRIXF* cov) { diff --git a/base/data_struct.h b/base/data_struct.h index 2bb5d7b..0071203 100644 --- a/base/data_struct.h +++ b/base/data_struct.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/data_struct_heap.c b/base/data_struct_heap.c index 254581c..ff46c6b 100644 --- a/base/data_struct_heap.c +++ b/base/data_struct_heap.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/data_struct_heap.h b/base/data_struct_heap.h index 6eb777d..04fd8f7 100644 --- a/base/data_struct_heap.h +++ b/base/data_struct_heap.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/data_struct_ll.c b/base/data_struct_ll.c index 6d85ce4..45de3da 100644 --- a/base/data_struct_ll.c +++ b/base/data_struct_ll.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/data_struct_ll.h b/base/data_struct_ll.h index c098c26..946e38b 100644 --- a/base/data_struct_ll.h +++ b/base/data_struct_ll.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/general_alg.c b/base/general_alg.c deleted file mode 100644 index 61d7436..0000000 --- a/base/general_alg.c +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -#include "config.h" -#include -#include -#include "logger.h" -#include "macros.h" -#include "general_alg.h" - -size_t break_string_into_substrings(char** s,const char* restrict sep,size_t nmax) -{ - size_t i,l,n; - - l=strlen(s[0]); - for(i=0,n=1;i=l) - break; - if(n>=nmax) - return nmax+1; - s[0][i++]=0; - i+=strspn(s[0]+i,sep); - if(i>=l) - break; - s[n++]=s[0]+i; - } - return n; -} - -long count_instance_fromfile(FILE* f,size_t size,size_t num,int (*func)(const void*)) -{ -#define CLEANUP AUTOFREE(dat) - size_t i,n; - AUTOALLOC(char,dat,size,10000) - size_t sret; - - if(!dat) - { - LOG(3,"Can't allocate memory of size "PRINTFSIZET".",size) - return -1; - } - for(i=0,n=0;i nmax, the last string s[nmax-1] contains - * separators because insufficient breaking. - */ -size_t break_string_into_substrings(char** s,const char* restrict sep,size_t nmax); - -/* Counts the number of instances that gives the criterion function a return value of true. - * Parameters: - * inst: the pointer to the first item of the instance array - * size: size of one instance - * num: the number of instances to be tested - * func: the criterion function to test the instances. - * Return: the number of instances that gives true for the function func, for a total of num - * instances, starting from inst, each of which has size bytes. - */ -static inline size_t count_instance(const void* inst,size_t size,size_t num,int (*func)(const void*)); - -/* Similar with above (count_instance), but processes a file handle. - * Return: the number of instances satisfying the criterion function func if succeed, - * or -1 if fail. - */ -long count_instance_fromfile(FILE* f,size_t size,size_t num,int (*func)(const void*)); - -/* Search for left insert position of item in ascending array. - * x: item to search for position - * a: ascending array - * n: size of a - * Return: left insert position of x (=0,...,n) - */ -static inline size_t bsearch_d(double x,const double* restrict a,size_t n); - /* Categorize data according to categorical information into separate arrays. * s: Source of data. * c: Categorical information. Each element contains a category of the corresponding element of s. @@ -96,31 +59,6 @@ static inline size_t remove_sorted_duplicates(double* restrict a,size_t n); -static inline size_t count_instance(const void* inst,size_t size,size_t num,int (*func)(const void*)) -{ - size_t i,n; - for(i=0,n=0;i=x) - end=mid; - else if(a[mid]=n if value outside range - */ -static inline size_t histogram_c2d(double val,const double* restrict range,size_t n); - /* Construct finer histogram ranges from existing sparse one. * Each bin is evenly split into n bins. * hnew[i*n+j]=(hold[i]*(n-j)+hold[i+1]*j)/n @@ -234,12 +226,10 @@ static inline void histogram_equalbins_fromnullcdf(size_t n,double left,double r math_cdf_quantile(n,left,right,func,param,eps,ans); } -static inline size_t histogram_c2d(double val,const double* restrict range,size_t n) -{ - if((valrange[n])) - return n; - return bsearch_d(val,(double*)range+1,n-1); -} + + + + diff --git a/base/lib.c b/base/lib.c index 3634d08..6feaa8a 100644 --- a/base/lib.c +++ b/base/lib.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/lib.h b/base/lib.h index 00d0088..244b531 100644 --- a/base/lib.h +++ b/base/lib.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/logger.c b/base/logger.c index 48e57ca..5c1c03f 100644 --- a/base/logger.c +++ b/base/logger.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/logger.h b/base/logger.h index 079a039..e4d6437 100644 --- a/base/logger.h +++ b/base/logger.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/macros.h b/base/macros.h index 712e447..35e6279 100644 --- a/base/macros.h +++ b/base/macros.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/math.c b/base/math.c index 7314fd1..ffe9eca 100644 --- a/base/math.c +++ b/base/math.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/math.h b/base/math.h index ba3e349..a421967 100644 --- a/base/math.h +++ b/base/math.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/os.h b/base/os.h index 3fb921c..23c2d7a 100644 --- a/base/os.h +++ b/base/os.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/random.c b/base/random.c index bb612f6..e918742 100644 --- a/base/random.c +++ b/base/random.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/random.h b/base/random.h index 3c4f4f9..78cf7fc 100644 --- a/base/random.h +++ b/base/random.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/supernormalize.c b/base/supernormalize.c index e8a9e27..0cd1698 100644 --- a/base/supernormalize.c +++ b/base/supernormalize.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/supernormalize.h b/base/supernormalize.h index 99b1cf7..5662e93 100644 --- a/base/supernormalize.h +++ b/base/supernormalize.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/threading.h b/base/threading.h index 6679587..bee398a 100644 --- a/base/threading.h +++ b/base/threading.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/base/types.h b/base/types.h index 4211dbf..d06b706 100644 --- a/base/types.h +++ b/base/types.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/cycle/cycle.h b/cycle/cycle.h index f7ae596..5b7f4c6 100644 --- a/cycle/cycle.h +++ b/cycle/cycle.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/cycle/vg.c b/cycle/vg.c index 3b7c30a..d0bab2b 100644 --- a/cycle/vg.c +++ b/cycle/vg.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/cycle/vg.h b/cycle/vg.h index 1127e1e..e3c1e51 100644 --- a/cycle/vg.h +++ b/cycle/vg.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/doc.pdf b/doc.pdf index 9eadc5d..22c7dfa 100644 Binary files a/doc.pdf and b/doc.pdf differ diff --git a/external/R.c b/external/R.c index 6ae9265..2c82e61 100644 --- a/external/R.c +++ b/external/R.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/netr/one.c b/netr/one.c index c2c5c74..0165897 100644 --- a/netr/one.c +++ b/netr/one.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/netr/one.h b/netr/one.h index 2a74782..1866b56 100644 --- a/netr/one.h +++ b/netr/one.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/pij/cassist/cassist.c b/pij/cassist/cassist.c index dfd33a5..c9026b6 100644 --- a/pij/cassist/cassist.c +++ b/pij/cassist/cassist.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -24,6 +24,7 @@ #include "../../base/const.h" #include "../../base/supernormalize.h" #include "../../base/threading.h" +#include "../../base/data_process.h" #include "llr.h" #include "llrtopij.h" #include "llrtopv.h" @@ -96,7 +97,7 @@ int pijs_cassist_pv(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* #undef CLEANUP } -int pijs_cassist_a(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,char nodiag,size_t memlimit) +int pijs_cassist(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,char nodiag,size_t memlimit) { #define CLEANUP CLEANMATF(gnew)CLEANMATF(tnew)CLEANMATF(tnew2) MATRIXF *gnew; //(ng,ns) Supernormalized transcript matrix @@ -140,6 +141,13 @@ int pijs_cassist_a(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* if(!(gnew&&tnew&&tnew2)) ERRRET("Not enough memory.") + //Check for identical rows in input data + { + VECTORFF(view) vbuff1=MATRIXFF(column)(tnew,0); + VECTORFF(view) vbuff2=MATRIXFF(row)(tnew2,0); + MATRIXFF(cmprow)(t,t2,&vbuff1.vector,&vbuff2.vector,nodiag,1); + } + //Step 1: Supernormalization LOG(9,"Supernormalizing...") MATRIXFF(memcpy)(gnew,g); @@ -155,7 +163,7 @@ int pijs_cassist_a(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* LOG(9,"Calculating real log likelihood ratios...") pij_cassist_llr(gnew,tnew,tnew2,p1,p2,p3,p4,p5); //Step 3: Convert log likelihood ratios to probabilities - if((ret=pij_cassist_llrtopijs_a(p1,p2,p3,p4,p5,ns,nodiag))) + if((ret=pij_cassist_llrtopijs(p1,p2,p3,p4,p5,ns,nodiag))) LOG(4,"Failed to convert all log likelihood ratios to probabilities.") if(nodiag) { @@ -175,17 +183,7 @@ int pijs_cassist_a(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* #undef CLEANUP } -int pijs_cassist(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,char nodiag,size_t memlimit) -{ - return pijs_cassist_a(g,t,t2,p1,p2,p3,p4,p5,nodiag,memlimit); -} - -/* Estimates the probability p(E->A->B) from genotype and expression data. Combines results - * from any pij_cassist_pijs. For more information, see pij_cassist_pijs_ab. - * ans: (ng,nt) Output matrix for probabilities. ans[A,B] is p(E->A->B). - * pijs: Function to calculate pijs. - */ -static int pij_cassist_any(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF* ans,char nodiag,int (*pijs)(const MATRIXF*,const MATRIXF*,const MATRIXF*,VECTORF*,MATRIXF*,MATRIXF*,MATRIXF*,MATRIXF*,char,size_t),size_t memlimit) +int pij_cassist(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF* ans,char nodiag,size_t memlimit) { #define CLEANUP CLEANVECF(p1)CLEANMATF(p2)CLEANMATF(p3)CLEANMATF(p4) VECTORF *p1; @@ -202,7 +200,7 @@ static int pij_cassist_any(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,M p4=MATRIXFF(alloc)(ng,nt); if(!(p1&&p2&&p3&&p4)) ERRRET("Not enough memory.") - if(pijs(g,t,t2,p1,p2,p3,p4,ans,nodiag,memlimit)) + if(pijs_cassist(g,t,t2,p1,p2,p3,p4,ans,nodiag,memlimit)) ERRRET("pij_cassist_pijs failed.") //Combine tests @@ -227,16 +225,6 @@ static int pij_cassist_any(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,M #undef CLEANUP } -int pij_cassist_a(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF* ans,char nodiag,size_t memlimit) -{ - return pij_cassist_any(g,t,t2,ans,nodiag,pijs_cassist_a,memlimit); -} - -int pij_cassist(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF* ans,char nodiag,size_t memlimit) -{ - return pij_cassist_a(g,t,t2,ans,nodiag,memlimit); -} - int pij_cassist_trad(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF* ans,char nodiag,size_t memlimit) { #define CLEANUP CLEANVECF(p1)CLEANMATF(p2)CLEANMATF(p4)CLEANMATF(p5) @@ -254,7 +242,7 @@ int pij_cassist_trad(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF p5=MATRIXFF(alloc)(ng,nt); if(!(p1&&p2&&p5&&p4)) ERRRET("Not enough memory.") - if(pijs_cassist_a(g,t,t2,p1,p2,ans,p4,p5,nodiag,memlimit)) + if(pijs_cassist(g,t,t2,p1,p2,ans,p4,p5,nodiag,memlimit)) ERRRET("pij_cassist_pijs failed.") //Combine tests diff --git a/pij/cassist/cassist.h b/pij/cassist/cassist.h index 858c915..238a49f 100644 --- a/pij/cassist/cassist.h +++ b/pij/cassist/cassist.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -65,14 +65,12 @@ int pijs_cassist_pv(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* * nt: Number of genes with expression data for B * ns: Number of samples. */ -int pijs_cassist_a(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,char nodiag,size_t memlimit); int pijs_cassist(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,char nodiag,size_t memlimit); /* Estimates the probability of A->B from genotype and expression data with defaults combination of tests. Uses results from pijs_gassist_tot or pijs_gassist_a. Variables have the same definitions except: * ans: (ng,nt) Predicted probability of A->B based on default combination of 5 tests. The default combination is (p2*p5+p4)/2. Note: this combination does not include p1. * Return: 0 on sucess */ -int pij_cassist_a(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF* ans,char nodiag,size_t memlimit); int pij_cassist(const MATRIXF* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF* ans,char nodiag,size_t memlimit); /* Estimates the probability of A->B from genotype and expression data with traditional causal inference method. diff --git a/pij/cassist/llr.c b/pij/cassist/llr.c index e3ae584..f92878d 100644 --- a/pij/cassist/llr.c +++ b/pij/cassist/llr.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -102,7 +102,6 @@ static void pij_cassist_llr_block(const MATRIXF* g,const MATRIXF* t,const MATRIX MATRIXFF(set)(llr5,i,j,(FTYPE)log(MATRIXFF(get)(llr5,i,j))); } } - //llr4=llr4 before scaling -0.5 for(i=0;i #include +#include #include "../../base/logger.h" #include "llrtopij.h" +#include "../llrtopij.h" -int pij_cassist_llrtopij1_1(VECTORF* p1) + + + + + +/* Always return probability of step 1 is 1. This is useful when best eQTL are already selected in advance. + */ +static inline int pij_cassist_llrtopij1_1(VECTORF* p1) { LOG(9,"Converting LLR to probabilities for step 1. Filling with 1.") VECTORFF(set_all)(p1,1); return 0; } +/* Functions to convert LLR of specific steps into probabilities. + * Uses pij_llrtopij_convert with different settings of n1d and n2d. + * Function name suffices indicate which LLR to convert. + */ +static inline int pij_cassist_llrtopij1(VECTORF* d) +{ + LOG(9,"Converting LLR to probabilities for step 1 on per A basis.") + return pij_cassist_llrtopij1_1(d); +} + +static inline int pij_cassist_llrtopij2(MATRIXF* d,size_t ns,char nodiag) +{ + LOG(9,"Converting LLR to probabilities for step 2 on per A basis.") + assert(ns>2); + return pij_llrtopij_convert_single_self(d,1,ns-2,nodiag,0); +} + +static inline int pij_cassist_llrtopij3(MATRIXF* d,size_t ns,char nodiag) +{ + LOG(9,"Converting LLR to probabilities for step 3 on per A basis.") + assert(ns>3); + if(pij_llrtopij_convert_single_self(d,1,ns-3,nodiag,0)) + return 1; + MATRIXFF(scale)(d,-1); + MATRIXFF(add_constant)(d,1); + return 0; +} + +static inline int pij_cassist_llrtopij4(MATRIXF* d,size_t ns,char nodiag) +{ + LOG(9,"Converting LLR to probabilities for step 4 on per A basis.") + assert(ns>3); + return pij_llrtopij_convert_single_self(d,2,ns-3,nodiag,0); +} + +static inline int pij_cassist_llrtopij5(MATRIXF* d,size_t ns,char nodiag) +{ + LOG(9,"Converting LLR to probabilities for step 5 on per A basis.") + assert(ns>3); + return pij_llrtopij_convert_single_self(d,1,ns-3,nodiag,0); +} + + +int pij_cassist_llrtopijs(VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t ns,char nodiag) +{ + int ret=0,ret2=0; + + if(ns<4) + { + LOG(0,"Cannot convert log likelihood ratios to probabilities. Needs at least 4 samples.") + return 1; + } + ret=ret||(ret2=pij_cassist_llrtopij2(p2,ns,nodiag)); + if(ret2) + LOG(1,"Failed to convert log likelihood ratios to probabilities in step 2.") + //For p1, if nodiag, copy p2 data, otherwise set all to 1. + if(nodiag) + { + VECTORFF(view) vv; + vv=MATRIXFF(diagonal)(p2); + ret=ret||(ret2=VECTORFF(memcpy)(p1,&vv.vector)); + } + else + ret=ret||(ret2=pij_cassist_llrtopij1_1(p1)); + if(ret2) + LOG(1,"Failed to convert log likelihood ratios to probabilities in step 1.") + ret=ret||(ret2=pij_cassist_llrtopij3(p3,ns,nodiag)); + if(ret2) + LOG(1,"Failed to convert log likelihood ratios to probabilities in step 3.") + ret=ret||(ret2=pij_cassist_llrtopij4(p4,ns,nodiag)); + if(ret2) + LOG(1,"Failed to convert log likelihood ratios to probabilities in step 4.") + ret=ret||(ret2=pij_cassist_llrtopij5(p5,ns,nodiag)); + if(ret2) + LOG(1,"Failed to convert log likelihood ratios to probabilities in step 5.") + return ret; +} + diff --git a/pij/cassist/llrtopij.h b/pij/cassist/llrtopij.h index b7ec39b..3401786 100644 --- a/pij/cassist/llrtopij.h +++ b/pij/cassist/llrtopij.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -23,16 +23,20 @@ #define _HEADER_LIB_PIJ_CASSIST_LLRTOPIJ_H_ #include "../../base/config.h" #include "../../base/types.h" -#include "llrtopij_a.h" #ifdef __cplusplus extern "C" { #endif - -/* Always return probability of step 1 is 1. This is useful when best eQTL are already selected in advance. +/* Converts four LLRs into probabilities together. + * Uses pij_cassit_llrtopij1_a to pij_cassit_llrtopij5_a. + * See above functions for parameter definitions. + * Return: 0 if all functions are successful. */ -int pij_cassist_llrtopij1_1(VECTORF* p1); +int pij_cassist_llrtopijs(VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t ns,char nodiag); + + + diff --git a/pij/cassist/llrtopij_a.c b/pij/cassist/llrtopij_a.c deleted file mode 100644 index 7a4f286..0000000 --- a/pij/cassist/llrtopij_a.c +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -#include "../../base/config.h" -#include -#include -#include -#include -#include "../../base/logger.h" -#include "../llrtopij.h" -#include "llrtopij_a.h" -#include "llrtopij.h" -#pragma GCC diagnostic ignored "-Wunused-parameter" - -/* Functions to convert LLR of specific steps into probabilities. - * Uses pij_llrtopij_a_convert with different settings of n1d and n2d. - * Function name suffices indicate which LLR to convert. - */ -static inline int pij_cassist_llrtopij1_a(VECTORF* d) -{ - LOG(9,"Converting LLR to probabilities for step 1 on per A basis.") - return pij_cassist_llrtopij1_1(d); -} - -static inline int pij_cassist_llrtopij2_a(MATRIXF* d,size_t ns,char nodiag) -{ - LOG(9,"Converting LLR to probabilities for step 2 on per A basis.") - assert(ns>2); - return pij_llrtopij_a_convert_single_self(d,1,ns-2,nodiag,0); -} - -static inline int pij_cassist_llrtopij3_a(MATRIXF* d,size_t ns,char nodiag) -{ - LOG(9,"Converting LLR to probabilities for step 3 on per A basis.") - assert(ns>3); - if(pij_llrtopij_a_convert_single_self(d,1,ns-3,nodiag,0)) - return 1; - MATRIXFF(scale)(d,-1); - MATRIXFF(add_constant)(d,1); - return 0; -} - -static inline int pij_cassist_llrtopij4_a(MATRIXF* d,size_t ns,char nodiag) -{ - LOG(9,"Converting LLR to probabilities for step 4 on per A basis.") - assert(ns>3); - return pij_llrtopij_a_convert_single_self(d,2,ns-3,nodiag,0); -} - -static inline int pij_cassist_llrtopij5_a(MATRIXF* d,size_t ns,char nodiag) -{ - LOG(9,"Converting LLR to probabilities for step 5 on per A basis.") - assert(ns>3); - return pij_llrtopij_a_convert_single_self(d,1,ns-3,nodiag,0); -} - - -int pij_cassist_llrtopijs_a(VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t ns,char nodiag) -{ - int ret=0,ret2=0; - - if(ns<4) - { - LOG(0,"Cannot convert log likelihood ratios to probabilities. Needs at least 4 samples.") - return 1; - } - ret=ret||(ret2=pij_cassist_llrtopij2_a(p2,ns,nodiag)); - if(ret2) - LOG(1,"Failed to convert log likelihood ratios to probabilities in step 2.") - //For p1, if nodiag, copy p2 data, otherwise set all to 1. - if(nodiag) - { - VECTORFF(view) vv; - vv=MATRIXFF(diagonal)(p2); - ret=ret||(ret2=VECTORFF(memcpy)(p1,&vv.vector)); - } - else - ret=ret||(ret2=pij_cassist_llrtopij1_1(p1)); - if(ret2) - LOG(1,"Failed to convert log likelihood ratios to probabilities in step 1.") - ret=ret||(ret2=pij_cassist_llrtopij3_a(p3,ns,nodiag)); - if(ret2) - LOG(1,"Failed to convert log likelihood ratios to probabilities in step 3.") - ret=ret||(ret2=pij_cassist_llrtopij4_a(p4,ns,nodiag)); - if(ret2) - LOG(1,"Failed to convert log likelihood ratios to probabilities in step 4.") - ret=ret||(ret2=pij_cassist_llrtopij5_a(p5,ns,nodiag)); - if(ret2) - LOG(1,"Failed to convert log likelihood ratios to probabilities in step 5.") - return ret; -} - - - - - - - - - - - diff --git a/pij/cassist/llrtopij_a.h b/pij/cassist/llrtopij_a.h deleted file mode 100644 index 5401e53..0000000 --- a/pij/cassist/llrtopij_a.h +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -/* This file contains the conversion from log likelihood ratio to probabilities - * - */ - -#ifndef _HEADER_LIB_PIJ_CASSIST_LLRTOPIJ_A_H_ -#define _HEADER_LIB_PIJ_CASSIST_LLRTOPIJ_A_H_ -#include "../../base/config.h" -#include "../../base/types.h" -#include "../llrtopij_a.h" -#ifdef __cplusplus -extern "C" -{ -#endif - -/* Converts four LLRs into probabilities together. - * Uses pij_cassit_llrtopij1_a to pij_cassit_llrtopij5_a. - * See above functions for parameter definitions. - * Return: 0 if all functions are successful. - */ -int pij_cassist_llrtopijs_a(VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t ns,char nodiag); - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#ifdef __cplusplus -} -#endif -#endif diff --git a/pij/cassist/llrtopv.h b/pij/cassist/llrtopv.h index 8519299..24179ae 100644 --- a/pij/cassist/llrtopv.h +++ b/pij/cassist/llrtopv.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -29,21 +29,19 @@ extern "C" { #endif - - /* Converts log likelihood ratios into p-values for continuous assisted causal inference test for each test separately. * d: MATRIXF of any size, as input of LLRs and also output of corresponding p-values * ns: Number of samples, to be used to calculate the null distribution */ static inline void pij_cassist_llrtopv1(VECTORF* d,size_t ns) { - assert(ns>2); + assert(ns>3); pij_llrtopv_block(d,1,ns-2); } static inline void pij_cassist_llrtopv2(MATRIXF* d,size_t ns) { - assert(ns>2); + assert(ns>3); pij_llrtopvm(d,1,ns-2); } diff --git a/pij/gassist/gassist.c b/pij/gassist/gassist.c index 0d33ef9..081f531 100644 --- a/pij/gassist/gassist.c +++ b/pij/gassist/gassist.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -24,12 +24,14 @@ #include "../../base/const.h" #include "../../base/supernormalize.h" #include "../../base/threading.h" +#include "../../base/data_process.h" +#include "../llrtopij.h" #include "llr.h" #include "llrtopv.h" #include "llrtopij.h" +#include "nullhist.h" #include "gassist.h" - int pijs_gassist_pv(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t nv,size_t memlimit) { #define CLEANUP CLEANMATF(tnew)CLEANMATF(tnew2) @@ -113,20 +115,17 @@ int pijs_gassist_pv(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* #undef CLEANUP } - -int pijs_gassist_a(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t nv,char nodiag,size_t memlimit) +int pijs_gassist(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t nv,char nodiag,size_t memlimit) { -#define CLEANUP CLEANMATF(tnew)CLEANMATF(tnew2) - MATRIXF *tnew,*tnew2; //(nt,ns) Supernormalized transcript matrix +#define CLEANUP CLEANMATF(tnew)CLEANMATF(tnew2)for(i=0;i<4;i++){if(hnull[i])for(j=0;jsize1; -#endif ng=g->size1; ns=g->size2; @@ -152,7 +151,7 @@ int pijs_gassist_a(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* ERRRET("Memory limit lower than minimum memory needed. Try increasing your memory usage limit.") LOG(10,"Memory limit: %lu bytes.",memlimit) nsplit=(size_t)ceil((float)ng/ceil((float)ng/(float)nsplit)); - //if(nsplitsize2); mvp4=MATRIXFF(submatrix)(p4,i,0,ngnow,p4->size2); mvp5=MATRIXFF(submatrix)(p5,i,0,ngnow,p5->size2); - //Step 2: Log likelihood ratios from nonpermuted data - LOG(9,"Calculating real log likelihood ratios...") if(pij_gassist_llr(&mvg.matrix,&mvt.matrix,tnew2,&vvp1.vector,&mvp2.matrix,&mvp3.matrix,&mvp4.matrix,&mvp5.matrix,nv)) ERRRET("pij_gassist_llr failed.") - //Step 3: Convert log likelihood ratios to probabilities - if((ret=pij_gassist_llrtopijs_a(&mvg.matrix,&vvp1.vector,&mvp2.matrix,&mvp3.matrix,&mvp4.matrix,&mvp5.matrix,nv,nodiag,(long)i))) + } + + //Step 3: Obtain null histograms + { + FTYPE dmax[4]; + dmax[0]=pij_llrtopij_llrmatmax(p2,nodiag); + dmax[1]=pij_llrtopij_llrmatmax(p3,nodiag); + dmax[2]=pij_llrtopij_llrmatmax(p4,nodiag); + dmax[3]=pij_llrtopij_llrmatmax(p5,nodiag); + if(!(dmax[0]&&dmax[1]&&dmax[2]&&dmax[3])) + ERRRET("Negative or NAN found in LLR.") + if(pij_gassist_nullhists(hnull,nt,ns,nv,dmax)) + ERRRET("Failed to construct null histograms.") + } + + //Step 4: Convert log likelihood ratios to probabilities + for(i=0;isize2); + mvt=MATRIXFF(submatrix)(tnew,i,0,ngnow,tnew->size2); + vvp1=VECTORFF(subvector)(p1,i,ngnow); + mvp2=MATRIXFF(submatrix)(p2,i,0,ngnow,p2->size2); + mvp3=MATRIXFF(submatrix)(p3,i,0,ngnow,p3->size2); + mvp4=MATRIXFF(submatrix)(p4,i,0,ngnow,p4->size2); + mvp5=MATRIXFF(submatrix)(p5,i,0,ngnow,p5->size2); + if((ret=pij_gassist_llrtopijs(&mvg.matrix,&vvp1.vector,&mvp2.matrix,&mvp3.matrix,&mvp4.matrix,&mvp5.matrix,nv,(const gsl_histogram* const **)hnull,nodiag,(long)i))) LOG(4,"Failed to convert all log likelihood ratios to probabilities.") if(nodiag) { @@ -207,17 +239,7 @@ int pijs_gassist_a(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* #undef CLEANUP } -int pijs_gassist(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t nv,char nodiag,size_t memlimit) -{ - return pijs_gassist_a(g,t,t2,p1,p2,p3,p4,p5,nv,nodiag,memlimit); -} - -/* Estimates the probability p(E->A->B) from genotype and expression data. Combines results - * from any pij_gassist_pijs. For more information, see pij_gassist_pijs_ab. - * ans: (ng,nt) Output matrix for probabilities. ans[A,B] is p(E->A->B). - * pijs: Function to calculate pijs. - */ -static int pij_gassist_any(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF* ans,size_t nv,char nodiag,int (*pijs)(const MATRIXG*,const MATRIXF*,const MATRIXF*,VECTORF*,MATRIXF*,MATRIXF*,MATRIXF*,MATRIXF*,size_t,char,size_t),size_t memlimit) +int pij_gassist(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF* ans,size_t nv,char nodiag,size_t memlimit) { #define CLEANUP CLEANVECF(p1)CLEANMATF(p2)CLEANMATF(p3)CLEANMATF(p4) VECTORF *p1; @@ -234,7 +256,7 @@ static int pij_gassist_any(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,M p4=MATRIXFF(alloc)(ng,nt); if(!(p1&&p2&&p3&&p4)) ERRRET("Not enough memory.") - if(pijs(g,t,t2,p1,p2,p3,p4,ans,nv,nodiag,memlimit)) + if(pijs_gassist(g,t,t2,p1,p2,p3,p4,ans,nv,nodiag,memlimit)) ERRRET("pij_gassist_pijs failed.") //Combine tests @@ -259,16 +281,6 @@ static int pij_gassist_any(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,M #undef CLEANUP } -int pij_gassist_a(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF* ans,size_t nv,char nodiag,size_t memlimit) -{ - return pij_gassist_any(g,t,t2,ans,nv,nodiag,pijs_gassist_a,memlimit); -} - -int pij_gassist(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF* ans,size_t nv,char nodiag,size_t memlimit) -{ - return pij_gassist_a(g,t,t2,ans,nv,nodiag,memlimit); -} - int pij_gassist_trad(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF* ans,size_t nv,char nodiag,size_t memlimit) { #define CLEANUP CLEANVECF(p1)CLEANMATF(p2)CLEANMATF(p4)CLEANMATF(p5) @@ -286,7 +298,7 @@ int pij_gassist_trad(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF p5=MATRIXFF(alloc)(ng,nt); if(!(p1&&p2&&p5&&p4)) ERRRET("Not enough memory.") - if(pijs_gassist_a(g,t,t2,p1,p2,ans,p4,p5,nv,nodiag,memlimit)) + if(pijs_gassist(g,t,t2,p1,p2,ans,p4,p5,nv,nodiag,memlimit)) ERRRET("pij_gassist_pijs failed.") //Combine tests diff --git a/pij/gassist/gassist.h b/pij/gassist/gassist.h index 66fd126..9cda131 100644 --- a/pij/gassist/gassist.h +++ b/pij/gassist/gassist.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -66,19 +66,17 @@ int pijs_gassist_pv(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* * nt: Number of genes with expression data for B * ns: Number of samples. */ -int pijs_gassist_a(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t nv,char nodiag,size_t memlimit); int pijs_gassist(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t nv,char nodiag,size_t memlimit); -/* Estimates the probability of A->B from genotype and expression data with defaults combination of tests. Uses results from pijs_gassist_tot or pijs_gassist_a. Variables have the same definitions except: +/* Estimates the probability of A->B from genotype and expression data with defaults combination of tests. Uses results from pijs_gassist. Variables have the same definitions except: * ans: (ng,nt) Predicted probability of A->B based on default combination of 5 tests. The default combination is (p2*p5+p4)/2. Note: this combination does not include p1. * Return: 0 on sucess */ -int pij_gassist_a(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF* ans,size_t nv,char nodiag,size_t memlimit); int pij_gassist(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF* ans,size_t nv,char nodiag,size_t memlimit); /* Estimates the probability of A->B from genotype and expression data with traditional causal inference method. * NOTE: This is not and is not intended as a loyal reimplementation of the Trigger R package. Instead, it aims at reusing methods and tests of Findr to produce inferences that mimicks the three tests performed by Trigger. Many implementational details are different between this function and Trigger, althrough a significant (but not full) overlap has been observed in existing studies. This method does not include p1. - * Inputs and ouputs are the same as function pij_gassist_a. + * Inputs and ouputs are the same as function pij_gassist. */ int pij_gassist_trad(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,MATRIXF* ans,size_t nv,char nodiag,size_t memlimit); diff --git a/pij/gassist/llr.c b/pij/gassist/llr.c index 0b200b1..56904de 100644 --- a/pij/gassist/llr.c +++ b/pij/gassist/llr.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -57,38 +57,6 @@ struct pij_gassist_llr_block_buffed_params{ MATRIXF** mb1; }; -void pij_gassist_llr_ratioandmean_1v1(const MATRIXG* g,const MATRIXF* t1,const MATRIXF* t2,MATRIXF* mratio,MATRIXF* mmean1,MATRIXF* mmean2,size_t nv) -{ - size_t ng=g->size1; - size_t ns=g->size2; - size_t i,j; - size_t val; - FTYPE f1; - - MATRIXFF(set_zero)(mratio); - MATRIXFF(set_zero)(mmean1); - MATRIXFF(set_zero)(mmean2); - - //Calculating sums - for(i=0;iB with A--B v.s. E->A<-B. A and B can have different sets of transcripts here. - * This is a more conservative version. - * g: MATRIXF (ng,ns) of genotype data - * t: MATRIXF (ng,ns) of transcript data for A - * t2: MATRIXF (nt,ns) of transcript data for B - * nv: number of possible values of g. - * llr2: MATRIXF (ng,nt) of output - * mratio: MATRIXF (nv,ng) of the ratio of samples for each SNP type. For buffer purpose. - * mmean: MATRIXF (nv,ng) of the means of each transcript among the samples of a specific SNP type. For buffer purpose. - * mmean2: MATRIXF[nv] (ng,nt) of the means of each transcript among the samples of a specific SNP type. For buffer purpose. - * mmb1: MATRIXF[nv] (ng,ns) Buffer matrix - * vb1: const VECTORF (ns). Must be set to 1 for all elements. - */ -//void pij_gassist_llr2c_buffed(const MATRIXG* g,const MATRIXF* t,const MATRIXF* tp,size_t nv,MATRIXF* llr2,MATRIXF* mratio,MATRIXF* mmean1,MATRIXF** mmean2,MATRIXF** mmb1,const VECTORF* vb1); - -//int pij_gassist_llr2c(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,size_t nv,MATRIXF* llr2); - -/* Calculates the log likelihood ratio of step 2 when the transcripts of those with eQTLs are different with those - * to be tested against: A<-E->B with A--B v.s. E->A. A and B can have different sets of transcripts here. - * This is a less conservative version because it does not exclude B->A. - * g: MATRIXF (ng,ns) of genotype data - * t: MATRIXF (ng,ns) of transcript data for A - * t2: MATRIXF (nt,ns) of transcript data for B - * nv: number of possible values of g. - * llr2: MATRIXF (ng,nt) of output - * mratio: MATRIXF (nv,ng) of the ratio of samples for each SNP type. For buffer purpose. - * mmean: MATRIXF (nv,ng) of the means of each transcript among the samples of a specific SNP type. For buffer purpose. - * mmean2: MATRIXF[nv] (ng,nt) of the means of each transcript among the samples of a specific SNP type. For buffer purpose. - * mmb1: MATRIXF[nv] (ng,ns) Buffer matrix - * mmb2: MATRIXF[nv] (ng,nt) Buffer matrix - * mb1: MATRIXF (ng,nt). Buffer matrix - * mb2: MATRIXF (ng,nt). Buffer matrix - * mb3: MATRIXF (ng,nt). Buffer matrix - * vb1: const VECTORF (ns). Must be set to 1 for all elements. - * vb2: VECTORF (ng). Buffer vector - */ -//void pij_gassist_llr2b_buffed(const MATRIXG* g,const MATRIXF* t,const MATRIXF* tp,size_t nv,MATRIXF* llr2,MATRIXF* mratio,MATRIXF* mmean1,MATRIXF** mmean2,MATRIXF** mmb1,MATRIXF** mmb2,MATRIXF* mb1,MATRIXF* mb2,MATRIXF* mb3,const VECTORF* vb1,VECTORF* vb2); - -/* Calculates the log likelihood ratio of step 2 when the transcripts of those with eQTLs are different with those - * to be tested against: A<-E->B with A--B v.s. E->A<-B. A and B can have different sets of transcripts here. - * g: MATRIXF (ng,ns) of genotype data - * t: MATRIXF (ng,ns) of transcript data for A - * t2: MATRIXF (nt,ns) of transcript data for B - * nv: number of possible values of g. - * llr2: MATRIXF (ng,nt) of output - * Return: 0 on success - */ -//int pij_gassist_llr2b(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,size_t nv,MATRIXF* llr2); - -//#define pij_gassist_llr2 pij_gassist_llr2_bold -//#define pij_gassist_llr2_buffed pij_gassist_llr2_bold_buffed - -/* Calculate log likelihood ratio of step 3 for single (E,A,B): - * log likelihood(E->A->B)-log likelihood(A<-E->B with A--B) - * fcov: covariance (A,B) for each B - * vratio: (nv) categorical ratio of each value of g - * vmean1: (nv) categorical mean of A - * vmean2: (nv) categorical mean of B - * vb1: (nv) buffer vector f_alpha mu_{alpha 1} - * Return: log likelihood ratio - */ -//FTYPE pij_gassist_llr3_one_from_stats_buffed(FTYPE fcov,const VECTORF* vratio,const VECTORF* vmean1,const VECTORF* vmean2,VECTORF* vb1); - -/* Calculate log likelihood ratio of step 3 for single (E,A) but multiple B: - * log likelihood(E->A->B)-log likelihood(A<-E->B with A--B) - * g: (ns) genotype data (E), each=0,1,...,nv-1 - * t1: (ns) Supernormalizedtranscript data for A - * t2: (nt,ns) Supernormalized transcript data for Bs - * nv: Number of values each entry of g can take. - * llr3: (nt) output buffer for calculated llr - * vb1: (ns) constant buffer, must be set to all 1 initially - * vratio: (nv) categorical ratio of each value of g - * vmean1: (nv) categorical mean of A - * mmean: (nt,nv) categorical mean of each B for each value of g - * vcov: (nt) covariance (A,B) for each B - * vb2: (nv) buffer vector f_alpha mu_{alpha 1} - * vb3: (nt) buffer matrix for f_alpha mu_{alpha 2} - * mb1: (nv,ns) buffer matrix of expanded representation of sample category - */ -//void pij_gassist_llr3_E1_buffed(const VECTORG* g,const VECTORF* t1,const MATRIXF* t2,size_t nv,VECTORF* llr3,const VECTORF* vb1,VECTORF *vratio,VECTORF* vmean1,MATRIXF *mmean,VECTORF *vcov,VECTORF *vb2,VECTORF *vb3,MATRIXF *mb1); - -/* Calculates log likelihood ratio for block of step 3 with buffer provided - * g: MATRIXF (ng,ns) Full genotype data matrix - * t: MATRIXF (ng,ns) Supernormalized transcript data matrix for A - * t2: MATRIXF (nt,ns) Supernormalized transcript data matrix for B - * nv: Number of possible values for each genotype - * llr3: MATRIXF (ng,nt). Log likelihood ratios for test 3. - * vb1: VECTORF (ns). Constant buffer vector, must be set to 1 initially. - * mratio: MATRIXF (nv,ng). Buffer matrix for categorical ratio - * mmean1: MATRIXF (nv,ng). Buffer matrix set for categorical mean - * mmean2: MATRIXF[nv] (ng,nt). Buffer matrix set for categorical mean - * mcov: MATRIXF (ng,nt). Buffer covariance matrix - * mmb1: MATRIXF[nv] (ng,nt). Buffer matrix - * mmb2: MATRIXF[nv] (ng,ns). Buffer matrix - */ -//void pij_gassist_llr3_buffed(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,size_t nv,MATRIXF* llr3,const VECTORF* vb1,MATRIXF *mratio,MATRIXF *mmean1,MATRIXF **mmean2,MATRIXF *mcov,MATRIXF **mmb1,MATRIXF **mmb2); - /* Multithread calculation of log likelihood ratios for 5 tests. * g: MATRIXF (ng,ns) Full genotype data matrix * t: MATRIXF (ng,ns) Supernormalized transcript data matrix of A @@ -172,6 +62,10 @@ int pij_gassist_llr(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,VECTORF* + + + + diff --git a/pij/gassist/llrtopij.c b/pij/gassist/llrtopij.c index c81bf49..4a826a7 100644 --- a/pij/gassist/llrtopij.c +++ b/pij/gassist/llrtopij.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -18,22 +18,210 @@ #include "../../base/config.h" #include #include -#include -#include #include +#include "../../base/gsl/math.h" +#include "../../base/gsl/histogram.h" +#include "../../base/gsl/blas.h" #include "../../base/logger.h" +#include "../../base/threading.h" #include "../../base/macros.h" #include "../../base/data_process.h" +#include "../llrtopij.h" #include "llrtopij.h" -int pij_gassist_llrtopij1_1(VECTORF* p1) + +/* Always return probability of step 1 is 1. This is useful when best eQTL are already selected in advance. + */ +static inline int pij_gassist_llrtopij1_1(VECTORF* p1) { LOG(9,"Converting LLR to probabilities for step 1. Filling with 1.") VECTORFF(set_all)(p1,1); return 0; } +/* Convert real log likelihood ratios into probability functions. + * This function converts every A in hypothesis (E->A->B) separately. + * Suppose there are ng (E,A) pairs and nt Bs, this function converts ng times, + * each for one (E,A) pair but all Bs. + * d: (ng,nt) Input log likelihood ratios for construction of + * histograms and calculation of probability of true hypothesis. + * g: (ng,ns) Original genotype matrix, used for analytical calculation + * of null distribution. Every element=0,1,...,nv-1. + * h: [nv-1]. Null histogram of the specific test. + * Output of pij_nullhist. + * nv: Maximum number of values each g may take. + * nodiag: If diagonal elements of d should be removed in construction of real + * histogram. This should be set to true (!=0) when t is identical with + * the top rows of t2 (in calculation of llr). + * Return: 0 if success. + */ +static int pij_gassist_llrtopij_convert_self(MATRIXF* d,const MATRIXG* g,const gsl_histogram * const * h, size_t nv,char nodiag,long nodiagshift) +{ +#define CLEANUP CLEANVECG(vcount)CLEANAMHIST(hreal,nth)CLEANAMHIST(hc,nth)\ + CLEANMATD(mb1)CLEANMATD(mb2)CLEANMATD(mnull)CLEANMATF(mb3)CLEANVECD(vwidth) + + VECTORG *vcount; + size_t ng=g->size1; + size_t i,nbin; + //gsl_histogram **hreal,**hc; + MATRIXD *mb1,*mb2,*mnull; + MATRIXF *mb3; + VECTORD *vwidth; + VECTORDF(view) vv1; + size_t nth; + + mb1=mb2=mnull=0; + mb3=0; + vwidth=0; + vcount=0; + //Validity checks + { + int nth0=omp_get_max_threads(); + assert(nth0>0); + nth=(size_t)nth0; + } + + + AUTOCALLOC(gsl_histogram*,hreal,nth,64) + AUTOCALLOC(gsl_histogram*,hc,nth,64) + if(!(hreal&&hc)) + ERRRET("Not enough memory."); + + //Construct null density histograms + nbin=h[0]->n; + //Memory allocation + { + size_t n1,n2; + pij_llrtopij_convert_histograms_get_buff_sizes(nbin,&n1,&n2); + mb1=MATRIXDF(alloc)(nth,n1); + mb2=MATRIXDF(alloc)(nth,n2); + mnull=MATRIXDF(alloc)(nth,nbin); + mb3=MATRIXFF(alloc)(nth,d->size2); + vwidth=VECTORDF(alloc)(nbin); + if(!(mb1&&mb2&&mnull&&mb3&&vwidth)) + ERRRET("Not enough memory.") + } + + //Prepare for real histogram + { + int ret; + for(i=0,ret=1;in+2); + ret=ret&&hreal[i]&&hc[i]; + } + vcount=VECTORGF(alloc)(ng); + if(!(ret&&vcount)) + ERRRET("Not enough memory."); + } + + { + VECTORUC *vb4=VECTORUCF(alloc)(nv); + if(!vb4) + ERRRET("Not enough memory."); + MATRIXGF(countv_byrow_buffed)(g,vcount,vb4); + CLEANVECUC(vb4) + } + + //Conversion + for(i=2;i<=nv;i++) + { + vv1=VECTORDF(view_array)(h[i-2]->range+1,nbin); + VECTORDF(memcpy)(vwidth,&vv1.vector); + vv1=VECTORDF(view_array)(h[i-2]->range,nbin); + VECTORDF(sub)(vwidth,&vv1.vector); + vv1=VECTORDF(view_array)(h[i-2]->bin,nbin); + #pragma omp parallel + { + size_t ng1,ng2,id; + size_t j; + long k; + VECTORDF(view) vvreal,vvnull,vvb1,vvb2; + VECTORFF(view) vvb3,vva; + + id=(size_t)omp_get_thread_num(); + vvreal=VECTORDF(view_array)(hreal[id]->bin,nbin); + vvnull=MATRIXDF(row)(mnull,id); + vvb1=MATRIXDF(row)(mb1,id); + vvb2=MATRIXDF(row)(mb2,id); + vvb3=MATRIXFF(row)(mb3,id); + threading_get_startend(ng,&ng1,&ng2); + + for(j=ng1;jrange,h[i-2]->range,(nbin+1)*sizeof(*hreal[id]->range)); + memset(hreal[id]->bin,0,nbin*sizeof(*hreal[id]->bin)); + //Construct real histogram + if(nodiag&&((long)j+nodiagshift>=0)&&((long)j+nodiagshift<(long)d->size2)) + { + for(k=(long)j+nodiagshift-1;k>=0;k--) + gsl_histogram_increment(hreal[id],MATRIXFF(get)(d,j,(size_t)k)); + for(k=(long)j+nodiagshift+1;k<(long)d->size2;k++) + gsl_histogram_increment(hreal[id],MATRIXFF(get)(d,j,(size_t)k)); + VECTORDF(scale)(&vvreal.vector,1./(double)(d->size2-1)); + } + else + { + for(k=0;k<(long)d->size2;k++) + gsl_histogram_increment(hreal[id],MATRIXFF(get)(d,j,(size_t)k)); + VECTORDF(scale)(&vvreal.vector,1./(double)(d->size2)); + } + + //Convert to density histogram + VECTORDF(div)(&vvreal.vector,vwidth); + //Convert to probability central histogram + pij_llrtopij_convert_histograms_buffed(hreal[id],&vvnull.vector,hc[id],&vvb1.vector,&vvb2.vector); + //Convert likelihoods to probabilities + vva=MATRIXFF(row)(d,j); + pij_llrtopij_histogram_interpolate_linear(hc[id],&vvb3.vector,&vva.vector); + } + } + } + CLEANUP + return 0; +#undef CLEANUP +} + +int pij_gassist_llrtopijs(const MATRIXG* g,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t nv,const gsl_histogram * const * h[4],char nodiag,long nodiagshift) +{ + int ret=0,ret2=0; + if(g->size2<=3) + { + LOG(0,"Needs at least 4 samples to compute probabilities.") + return 1; + } + ret=ret||(ret2=pij_gassist_llrtopij_convert_self(p2,g,h[0],nv,nodiag,nodiagshift)); + if(ret2) + LOG(1,"Failed to log likelihood ratios to probabilities in step 2.") + //For p1, if nodiag, copy p2 data, otherwise set all to 1. + if(nodiag) + { + VECTORFF(view) vv; + vv=MATRIXFF(superdiagonal)(p2,(size_t)nodiagshift); + ret=(ret2=VECTORFF(memcpy)(p1,&vv.vector)); + } + else + ret=(ret2=pij_gassist_llrtopij1_1(p1)); + if(ret2) + LOG(1,"Failed to log likelihood ratios to probabilities in step 1.") + ret=ret||(ret2=pij_gassist_llrtopij_convert_self(p3,g,h[1],nv,nodiag,nodiagshift)); + if(ret2) + LOG(1,"Failed to log likelihood ratios to probabilities in step 3.") + MATRIXFF(scale)(p3,-1); + MATRIXFF(add_constant)(p3,1); + ret=ret||(ret2=pij_gassist_llrtopij_convert_self(p4,g,h[2],nv,nodiag,nodiagshift)); + if(ret2) + LOG(1,"Failed to log likelihood ratios to probabilities in step 4.") + ret=ret||(ret2=pij_gassist_llrtopij_convert_self(p5,g,h[3],nv,nodiag,nodiagshift)); + if(ret2) + LOG(1,"Failed to log likelihood ratios to probabilities in step 5.") + return ret; +} diff --git a/pij/gassist/llrtopij.h b/pij/gassist/llrtopij.h index a1a8a91..53b51b9 100644 --- a/pij/gassist/llrtopij.h +++ b/pij/gassist/llrtopij.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -24,17 +24,20 @@ #include "../../base/config.h" #include "../../base/gsl/histogram.h" #include "../../base/types.h" -#include "llrtopij_tot.h" -#include "llrtopij_a.h" #ifdef __cplusplus extern "C" { #endif -/* Always return probability of step 1 is 1. This is useful when best eQTL are already selected in advance. + +/* Converts four LLRs into probabilities together. + * Uses pij_gassist_llrtopij1 to pij_gassist_llrtopij5. + * See above functions for parameter definitions. + * h: Null histograms. 0 to 3 for tests 2 to 5. + * Return: 0 if all functions are successful. */ -int pij_gassist_llrtopij1_1(VECTORF* p1); +int pij_gassist_llrtopijs(const MATRIXG* g,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t nv,const gsl_histogram * const * h[4],char nodiag,long nodiagshift); diff --git a/pij/gassist/llrtopij_a.c b/pij/gassist/llrtopij_a.c deleted file mode 100644 index 1dccc25..0000000 --- a/pij/gassist/llrtopij_a.c +++ /dev/null @@ -1,430 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -#include "../../base/config.h" -#include -#include -#include -#include "../../base/gsl/math.h" -#include "../../base/gsl/histogram.h" -#include "../../base/gsl/blas.h" -#include "../../base/logger.h" -#include "../../base/threading.h" -#include "../../base/macros.h" -#include "../../base/data_process.h" -#include "../../base/histogram.h" -#include "../nulldist.h" -#include "../llrtopij.h" -#include "llrtopij_a.h" -#include "llrtopij.h" -#pragma GCC diagnostic ignored "-Wunused-parameter" - -int pij_gassist_llrtopij_a_convert(const MATRIXF* d,const MATRIXF* dconv,MATRIXF* ans,const MATRIXG* g,size_t nv,long n1c,size_t n1d,long n2c,size_t n2d,char nodiag,long nodiagshift) -{ -#define CLEANUP CLEANVECG(vcount)CLEANAMHIST(hreal,nth)CLEANAMHIST(hc,nth)\ - CLEANMHIST(h,(nv-1))CLEANMATD(mb1)CLEANMATD(mb2)CLEANMATD(mnull)CLEANVECD(vwidth) - - VECTORG *vcount; - size_t ng=g->size1; - size_t i,nbin; - gsl_histogram **h; - //gsl_histogram **hreal,**hc; - MATRIXD *mb1,*mb2,*mnull; - VECTORD *vwidth; - VECTORDF(view) vv1; - size_t nth; - - h=0; - mb1=mb2=mnull=0; - vwidth=0; - vcount=0; - //Validity checks - assert((dconv->size1==ng)&&(ans->size1==ng)&&(d->size1==ng)&&(dconv->size2==ans->size2)); - - { - int nth0=omp_get_max_threads(); - assert(nth0>0); - nth=(size_t)nth0; - } - - - AUTOCALLOC(gsl_histogram*,hreal,nth,64) - AUTOCALLOC(gsl_histogram*,hc,nth,64) - if(!(hreal&&hc)) - ERRRET("Not enough memory."); - - //Construct null density histograms - { - FTYPE dmin,dmax; - MATRIXFF(minmax)(d,&dmin,&dmax); - if((!(dmin>=0))||gsl_isnan(dmax)||gsl_isinf(dmax)) - ERRRET("Negative or NAN found in input data. It may invalidate follow up analysis. This may be due to incorrect previous steps.") - h=pij_llrtopij_a_nullhist((double)dmax,nv,d->size2,n1c,n1d,n2c,n2d); - if(!h) - ERRRET("pij_llrtopij_a_nullhist failed.") - nbin=h[0]->n; - } - //Memory allocation - { - size_t n1,n2; - pij_llrtopij_convert_histograms_get_buff_sizes(nbin,&n1,&n2); - mb1=MATRIXDF(alloc)(nth,n1); - mb2=MATRIXDF(alloc)(nth,n2); - mnull=MATRIXDF(alloc)(nth,nbin); - vwidth=VECTORDF(alloc)(nbin); - if(!(mb1&&mb2&&mnull&&vwidth)) - ERRRET("Not enough memory.") - } - - //Prepare for real histogram - { - int ret; - for(i=0,ret=1;in+2); - ret=ret&&hreal[i]&&hc[i]; - } - vcount=VECTORGF(alloc)(ng); - if(!(ret&&vcount)) - ERRRET("Not enough memory."); - } - - { - VECTORUC *vb4=VECTORUCF(alloc)(nv); - if(!vb4) - ERRRET("Not enough memory."); - MATRIXGF(countv_byrow_buffed)(g,vcount,vb4); - CLEANVECUC(vb4) - } - - - //Conversion - vv1=VECTORDF(view_array)(h[0]->range+1,nbin); - VECTORDF(memcpy)(vwidth,&vv1.vector); - vv1=VECTORDF(view_array)(h[0]->range,nbin); - VECTORDF(sub)(vwidth,&vv1.vector); - - - //Conversion - for(i=2;i<=nv;i++) - { - vv1=VECTORDF(view_array)(h[i-2]->range+1,nbin); - VECTORDF(memcpy)(vwidth,&vv1.vector); - vv1=VECTORDF(view_array)(h[i-2]->range,nbin); - VECTORDF(sub)(vwidth,&vv1.vector); - vv1=VECTORDF(view_array)(h[i-2]->bin,nbin); - #pragma omp parallel - { - size_t ng1,ng2,id; - size_t j; - long k; - VECTORDF(view) vvreal,vvnull,vvb1,vvb2; - VECTORFF(view) vva; - - id=(size_t)omp_get_thread_num(); - vvreal=VECTORDF(view_array)(hreal[id]->bin,nbin); - vvnull=MATRIXDF(row)(mnull,id); - vvb1=MATRIXDF(row)(mb1,id); - vvb2=MATRIXDF(row)(mb2,id); - threading_get_startend(ng,&ng1,&ng2); - - for(j=ng1;jrange,h[i-2]->range,(nbin+1)*sizeof(*hreal[id]->range)); - memset(hreal[id]->bin,0,nbin*sizeof(*hreal[id]->bin)); - //Construct real histogram - if(nodiag&&((long)j+nodiagshift>=0)&&((long)j+nodiagshift<(long)d->size2)) - { - for(k=(long)j+nodiagshift-1;k>=0;k--) - gsl_histogram_increment(hreal[id],MATRIXFF(get)(d,j,(size_t)k)); - for(k=(long)j+nodiagshift+1;k<(long)d->size2;k++) - gsl_histogram_increment(hreal[id],MATRIXFF(get)(d,j,(size_t)k)); - VECTORDF(scale)(&vvreal.vector,1./(double)(d->size2-1)); - } - else - { - for(k=0;k<(long)d->size2;k++) - gsl_histogram_increment(hreal[id],MATRIXFF(get)(d,j,(size_t)k)); - VECTORDF(scale)(&vvreal.vector,1./(double)(d->size2)); - } - - //Convert to density histogram - VECTORDF(div)(&vvreal.vector,vwidth); - //Convert to probability central histogram - pij_llrtopij_convert_histograms_buffed(hreal[id],&vvnull.vector,hc[id],&vvb1.vector,&vvb2.vector); - //Convert likelihoods to probabilities - vva=MATRIXFF(row)(ans,j); - pij_llrtopij_histogram_interpolate_linear(hc[id],&vvd.vector,&vva.vector); - } - } - } - CLEANUP - return 0; -#undef CLEANUP -} - -int pij_gassist_llrtopij_a_convert_self(MATRIXF* d,const MATRIXG* g,size_t nv,long n1c,size_t n1d,long n2c,size_t n2d,char nodiag,long nodiagshift) -{ -#define CLEANUP CLEANVECG(vcount)CLEANAMHIST(hreal,nth)CLEANAMHIST(hc,nth)\ - CLEANMHIST(h,(nv-1))CLEANMATD(mb1)CLEANMATD(mb2)CLEANMATD(mnull)CLEANMATF(mb3)CLEANVECD(vwidth) - - VECTORG *vcount; - size_t ng=g->size1; - size_t i,nbin; - gsl_histogram **h; - //gsl_histogram **hreal,**hc; - MATRIXD *mb1,*mb2,*mnull; - MATRIXF *mb3; - VECTORD *vwidth; - VECTORDF(view) vv1; - size_t nth; - - h=0; - mb1=mb2=mnull=0; - mb3=0; - vwidth=0; - vcount=0; - //Validity checks - { - int nth0=omp_get_max_threads(); - assert(nth0>0); - nth=(size_t)nth0; - } - - - AUTOCALLOC(gsl_histogram*,hreal,nth,64) - AUTOCALLOC(gsl_histogram*,hc,nth,64) - if(!(hreal&&hc)) - ERRRET("Not enough memory."); - - //Construct null density histograms - { - FTYPE dmin,dmax; - if(nodiag) - MATRIXFF(minmax_nodiag)(d,&dmin,&dmax,nodiagshift); - else - MATRIXFF(minmax)(d,&dmin,&dmax); - if((!(dmin>=0))||gsl_isnan(dmax)) - ERRRET("Negative or NAN found in input data. It may invalidate follow up analysis. This may be due to incorrect previous steps.") - if(gsl_isinf(dmax)) - { - LOG(5,"INF found in input data. It may invalidate follow up analysis. This may be due to incorrect previous steps or duplicate rows (by Spearman correlation).") - MATRIXFF(set_inf)(d,-1); - if(nodiag) - MATRIXFF(minmax_nodiag)(d,&dmin,&dmax,nodiagshift); - else - MATRIXFF(minmax)(d,&dmin,&dmax); - MATRIXFF(set_value)(d,-1,dmax); - } - h=pij_llrtopij_a_nullhist((double)dmax,nv,d->size2,n1c,n1d,n2c,n2d); - if(!h) - ERRRET("pij_llrtopij_a_nullhist failed.") - nbin=h[0]->n; - } - //Memory allocation - { - size_t n1,n2; - pij_llrtopij_convert_histograms_get_buff_sizes(nbin,&n1,&n2); - mb1=MATRIXDF(alloc)(nth,n1); - mb2=MATRIXDF(alloc)(nth,n2); - mnull=MATRIXDF(alloc)(nth,nbin); - mb3=MATRIXFF(alloc)(nth,d->size2); - vwidth=VECTORDF(alloc)(nbin); - if(!(mb1&&mb2&&mnull&&mb3&&vwidth)) - ERRRET("Not enough memory.") - } - - //Prepare for real histogram - { - int ret; - for(i=0,ret=1;in+2); - ret=ret&&hreal[i]&&hc[i]; - } - vcount=VECTORGF(alloc)(ng); - if(!(ret&&vcount)) - ERRRET("Not enough memory."); - } - - { - VECTORUC *vb4=VECTORUCF(alloc)(nv); - if(!vb4) - ERRRET("Not enough memory."); - MATRIXGF(countv_byrow_buffed)(g,vcount,vb4); - CLEANVECUC(vb4) - } - - //Conversion - for(i=2;i<=nv;i++) - { - vv1=VECTORDF(view_array)(h[i-2]->range+1,nbin); - VECTORDF(memcpy)(vwidth,&vv1.vector); - vv1=VECTORDF(view_array)(h[i-2]->range,nbin); - VECTORDF(sub)(vwidth,&vv1.vector); - vv1=VECTORDF(view_array)(h[i-2]->bin,nbin); - #pragma omp parallel - { - size_t ng1,ng2,id; - size_t j; - long k; - VECTORDF(view) vvreal,vvnull,vvb1,vvb2; - VECTORFF(view) vvb3,vva; - - id=(size_t)omp_get_thread_num(); - vvreal=VECTORDF(view_array)(hreal[id]->bin,nbin); - vvnull=MATRIXDF(row)(mnull,id); - vvb1=MATRIXDF(row)(mb1,id); - vvb2=MATRIXDF(row)(mb2,id); - vvb3=MATRIXFF(row)(mb3,id); - threading_get_startend(ng,&ng1,&ng2); - - for(j=ng1;jrange,h[i-2]->range,(nbin+1)*sizeof(*hreal[id]->range)); - memset(hreal[id]->bin,0,nbin*sizeof(*hreal[id]->bin)); - //Construct real histogram - if(nodiag&&((long)j+nodiagshift>=0)&&((long)j+nodiagshift<(long)d->size2)) - { - for(k=(long)j+nodiagshift-1;k>=0;k--) - gsl_histogram_increment(hreal[id],MATRIXFF(get)(d,j,(size_t)k)); - for(k=(long)j+nodiagshift+1;k<(long)d->size2;k++) - gsl_histogram_increment(hreal[id],MATRIXFF(get)(d,j,(size_t)k)); - VECTORDF(scale)(&vvreal.vector,1./(double)(d->size2-1)); - } - else - { - for(k=0;k<(long)d->size2;k++) - gsl_histogram_increment(hreal[id],MATRIXFF(get)(d,j,(size_t)k)); - VECTORDF(scale)(&vvreal.vector,1./(double)(d->size2)); - } - - //Convert to density histogram - VECTORDF(div)(&vvreal.vector,vwidth); - //Convert to probability central histogram - pij_llrtopij_convert_histograms_buffed(hreal[id],&vvnull.vector,hc[id],&vvb1.vector,&vvb2.vector); - //Convert likelihoods to probabilities - vva=MATRIXFF(row)(d,j); - pij_llrtopij_histogram_interpolate_linear(hc[id],&vvb3.vector,&vva.vector); - } - } - } - CLEANUP - return 0; -#undef CLEANUP -} - -/* Functions to convert LLR of specific steps into probabilities. - * Uses pij_llrtopij_a_convert with different settings of n1d and n2d. - * Function name suffices indicate which LLR to convert. - */ -static inline int pij_gassist_llrtopij1_a(MATRIXF* d,const MATRIXG* g,size_t nv,char nodiag,long nodiagshift) -{ - LOG(9,"Converting LLR to probabilities for step 1 on per A basis.") - assert(g->size2>2); - return pij_gassist_llrtopij_a_convert_self(d,g,nv,1,1,1,g->size2-2,nodiag,nodiagshift); -} - -static inline int pij_gassist_llrtopij2_a(MATRIXF* d,const MATRIXG* g,size_t nv,char nodiag,long nodiagshift) -{ - LOG(9,"Converting LLR to probabilities for step 2 on per A basis.") - assert(g->size2>2); - return pij_gassist_llrtopij_a_convert_self(d,g,nv,1,1,1,g->size2-2,nodiag,nodiagshift); -} - -static inline int pij_gassist_llrtopij3_a(MATRIXF* d,const MATRIXG* g,size_t nv,char nodiag,long nodiagshift) -{ - LOG(9,"Converting LLR to probabilities for step 3 on per A basis.") - assert(g->size2>3); - if(pij_gassist_llrtopij_a_convert_self(d,g,nv,1,1,1,g->size2-3,nodiag,nodiagshift)) - return 1; - MATRIXFF(scale)(d,-1); - MATRIXFF(add_constant)(d,1); - return 0; -} - -static inline int pij_gassist_llrtopij4_a(MATRIXF* d,const MATRIXG* g,size_t nv,char nodiag,long nodiagshift) -{ - LOG(9,"Converting LLR to probabilities for step 4 on per A basis.") - assert(g->size2>3); - return pij_gassist_llrtopij_a_convert_self(d,g,nv,1,2,1,g->size2-3,nodiag,nodiagshift); -} - -static inline int pij_gassist_llrtopij5_a(MATRIXF* d,const MATRIXG* g,size_t nv,char nodiag,long nodiagshift) -{ - LOG(9,"Converting LLR to probabilities for step 5 on per A basis.") - assert(g->size2>3); - if(pij_gassist_llrtopij_a_convert_self(d,g,nv,0,1,1,g->size2-3,nodiag,nodiagshift)) - return 1; - return 0; -} - - -int pij_gassist_llrtopijs_a(const MATRIXG* g,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t nv,char nodiag,long nodiagshift) -{ - int ret=0,ret2=0; - if(g->size2<=3) - { - LOG(0,"Needs at least 4 samples to compute probabilities.") - return 1; - } - ret=ret||(ret2=pij_gassist_llrtopij2_a(p2,g,nv,0,0)); - if(ret2) - LOG(1,"Failed to log likelihood ratios to probabilities in step 2.") - //For p1, if nodiag, copy p2 data, otherwise set all to 1. - if(nodiag) - { - VECTORFF(view) vv; - vv=MATRIXFF(superdiagonal)(p2,(size_t)nodiagshift); - ret=(ret2=VECTORFF(memcpy)(p1,&vv.vector)); - } - else - ret=(ret2=pij_gassist_llrtopij1_1(p1)); - if(ret2) - LOG(1,"Failed to log likelihood ratios to probabilities in step 1.") - ret=ret||(ret2=pij_gassist_llrtopij3_a(p3,g,nv,nodiag,nodiagshift)); - if(ret2) - LOG(1,"Failed to log likelihood ratios to probabilities in step 3.") - ret=ret||(ret2=pij_gassist_llrtopij4_a(p4,g,nv,nodiag,nodiagshift)); - if(ret2) - LOG(1,"Failed to log likelihood ratios to probabilities in step 4.") - ret=ret||(ret2=pij_gassist_llrtopij5_a(p5,g,nv,nodiag,nodiagshift)); - if(ret2) - LOG(1,"Failed to log likelihood ratios to probabilities in step 5.") - return ret; -} - - - - - - - - - - - diff --git a/pij/gassist/llrtopij_a.h b/pij/gassist/llrtopij_a.h deleted file mode 100644 index 0c75c77..0000000 --- a/pij/gassist/llrtopij_a.h +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -/* This file contains the conversion from log likelihood ratio to probabilities - * - */ - -#ifndef _HEADER_LIB_PIJ_GASSIST_LLRTOPIJ_A_H_ -#define _HEADER_LIB_PIJ_GASSIST_LLRTOPIJ_A_H_ -#include "../../base/config.h" -#include "../../base/types.h" -#include "../llrtopij_a.h" -#ifdef __cplusplus -extern "C" -{ -#endif - -/* Convert real log likelihood ratios into probability functions. - * This function converts every A in hypothesis (E->A->B) separately. - * Suppose there are ng (E,A) pairs and nt Bs, this function converts ng times, - * each for one (E,A) pair but all Bs. - * d: (ng,nt) Input log likelihood ratios for construction of - * histograms and calculation of probability of true hypothesis. - * dconv: (ng,nx) Actual log likelihood ratios to be converted to - * probabilities using histogram constructed base on d. - * ans: (ng,nx) Output matrix for probabilities. - * g: (ng,ns) Original genotype matrix, used for analytical calculation - * of null distribution. Every element=0,1,...,nv-1. - * nv: Maximum number of values each g may take. - * n1c, - * n1d, - * n2c, - * n2d: Parameters to specify null distribution. See pij_llrtopij_a_nullhist - * nodiag: If diagonal elements of d should be removed in construction of real - * histogram. This should be set to true (!=0) when t is identical with - * the top rows of t2 (in calculation of llr). - * Return: 0 if success. - */ -int pij_gassist_llrtopij_a_convert(const MATRIXF* d,const MATRIXF* dconv,MATRIXF* ans,const MATRIXG* g,size_t nv,long n1c,size_t n1d,long n2c,size_t n2d,char nodiag,long nodiagshift); - -int pij_gassist_llrtopij_a_convert_self(MATRIXF* d,const MATRIXG* g,size_t nv,long n1c,size_t n1d,long n2c,size_t n2d,char nodiag,long nodiagshift); - -/* Converts four LLRs into probabilities together. - * Uses pij_gassist_llrtopij1_a to pij_gassist_llrtopij5_a. - * See above functions for parameter definitions. - * Return: 0 if all functions are successful. - */ -int pij_gassist_llrtopijs_a(const MATRIXG* g,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t nv,char nodiag,long nodiagshift); - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#ifdef __cplusplus -} -#endif -#endif diff --git a/pij/gassist/llrtopij_tot.c b/pij/gassist/llrtopij_tot.c deleted file mode 100644 index 2a65e40..0000000 --- a/pij/gassist/llrtopij_tot.c +++ /dev/null @@ -1,184 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -#include "../../base/config.h" -#include -#include -#include -#include -#include -#include "../../base/gsl/math.h" -#include "../../base/gsl/histogram.h" -#include "../../base/gsl/blas.h" -#include "../../base/logger.h" -#include "../../base/macros.h" -#include "../../base/data_process.h" -#include "../../base/threading.h" -#include "../../base/histogram.h" -#include "../nullhist.h" -#include "nullhist.h" -#include "llrtopij_tot.h" -#include "llrtopij.h" -#pragma GCC diagnostic ignored "-Wunused-parameter" - - -int pij_gassist_llrtopij1_tot(const MATRIXG* g,const VECTORF* llr,VECTORF* p,size_t nv,double* nratio) -{ - const struct histogram_unequalbins_exp_param ph={g->size2-nv,1}; - const struct pij_gassist_nullhist_analytical_pdf_param pnh={g,nv,5}; - return pij_llrtopij_tot_convert(llr,llr,p,histogram_unequalbins_exp,pij_gassist_nullhist_analytical1_pdf,&ph,&pnh,nratio); -} - -int pij_gassist_llrtopij2_tot(const MATRIXG* g,const VECTORF* llr,VECTORF* p,size_t nv,double* nratio) -{ - const struct histogram_unequalbins_exp_param ph={g->size2-nv,1}; - const struct pij_gassist_nullhist_analytical_pdf_param pnh={g,nv,5}; - return pij_llrtopij_tot_convert(llr,llr,p,histogram_unequalbins_exp,pij_gassist_nullhist_analytical2_pdf,&ph,&pnh,nratio); -} - -int pij_gassist_llrtopij3_tot(const MATRIXG* g,const VECTORF* llr,VECTORF* p,size_t nv,double* nratio) -{ -#define CLEANUP - const struct histogram_unequalbins_exp_param ph={g->size2-nv,1}; - const struct pij_gassist_nullhist_analytical_pdf_param pnh={g,nv,5}; - if(pij_llrtopij_tot_convert(llr,llr,p,histogram_unequalbins_exp,pij_gassist_nullhist_analytical3_pdf,&ph,&pnh,nratio)) - ERRRET("Conversion failed.") - VECTORFF(scale)(p,-1); - VECTORFF(add_constant)(p,1); - return 0; -#undef CLEANUP -} - -int pij_gassist_llrtopij4_tot(const MATRIXG* g,const VECTORF* llr,VECTORF* p,size_t nv,double* nratio) -{ - const struct histogram_unequalbins_exp_param ph={g->size2-nv,1}; - const struct pij_gassist_nullhist_analytical_pdf_param pnh={g,nv,5}; - return pij_llrtopij_tot_convert(llr,llr,p,histogram_unequalbins_exp,pij_gassist_nullhist_analytical4_pdf,&ph,&pnh,nratio); -} - -int pij_gassist_llrtopij5_tot(const MATRIXG* g,const VECTORF* llr,VECTORF* p,size_t nv,double* nratio) -{ - const struct histogram_unequalbins_exp_param ph={g->size2-nv,1}; - const struct pij_gassist_nullhist_analytical_pdf_param pnh={g,nv,5}; - return pij_llrtopij_tot_convert(llr,llr,p,histogram_unequalbins_exp,pij_gassist_nullhist_analytical5_pdf,&ph,&pnh,nratio); -} - -int pij_gassist_llrtopijs_tot(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,const VECTORF* llr1,const MATRIXF* llr2,const MATRIXF* llr3,const MATRIXF* llr4,const MATRIXF* llr5,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t nv,char nodiag) -{ -#define CLEANUP CLEANVECF(vp)CLEANVECF(tp) - int ret; - size_t ng,n; - double nratio; //Ratio of null distribution - VECTORF *vp,*tp; - - //Initialize - vp=tp=0; - ng=t->size1; - n=ng*t2->size1; - if(nodiag) - n-=GSL_MIN(ng,t2->size1); - vp=VECTORFF(alloc)(n); - tp=VECTORFF(alloc)(n); - if(!(vp&&tp)) - ERRRET("Not enough memory.") - - if(VECTORFF(first_nan)(llr1)>=0) - LOG(4,"Infinity found for log likelihood ratio, possibly because data contains fully correlated/anticorrelated columns. This may affect downstream analyses.") - ret=pij_gassist_llrtopij1_1(p1); - if(ret) - ERRRET("Failed to calculate probabilities in step 1.") - - if(nodiag) - MATRIXFF(flatten_nodiag)(llr2,tp); - else - MATRIXFF(flatten)(llr2,tp); - if(VECTORFF(first_nan)(tp)>=0) - LOG(4,"Infinity found for log likelihood ratio, possibly because data contains fully correlated/anticorrelated columns. This may affect downstream analyses.") - ret=pij_gassist_llrtopij2_tot(g,tp,vp,nv,&nratio); - if(ret) - ERRRET("Failed to calculate probabilities in step 2.") - if(nodiag) - VECTORFF(wrap_nodiag)(vp,p2); - else - VECTORFF(wrap)(vp,p2); - - if(nodiag) - MATRIXFF(flatten_nodiag)(llr3,tp); - else - MATRIXFF(flatten)(llr3,tp); - if(VECTORFF(first_nan)(tp)>=0) - { - LOG(4,"Infinity found for log likelihood ratio, possibly because data contains fully correlated/anticorrelated columns. This may affect downstream analyses.") - VECTORFF(set_nan)(tp,0); - } - ret=pij_gassist_llrtopij3_tot(g,tp,vp,nv,&nratio); - if(ret) - ERRRET("Failed to calculate probabilities in step 3.") - if(nodiag) - VECTORFF(wrap_nodiag)(vp,p3); - else - VECTORFF(wrap)(vp,p3); - - if(nodiag) - MATRIXFF(flatten_nodiag)(llr4,tp); - else - MATRIXFF(flatten)(llr4,tp); - if(VECTORFF(first_nan)(tp)>=0) - LOG(4,"Infinity found for log likelihood ratio, possibly because data contains fully correlated/anticorrelated columns. This may affect downstream analyses.") - ret=pij_gassist_llrtopij4_tot(g,tp,vp,nv,&nratio); - if(ret) - ERRRET("Failed to calculate probabilities in step 4.") - if(nodiag) - VECTORFF(wrap_nodiag)(vp,p4); - else - VECTORFF(wrap)(vp,p4); - - if(nodiag) - MATRIXFF(flatten_nodiag)(llr4,tp); - else - MATRIXFF(flatten)(llr4,tp); - if(VECTORFF(first_nan)(tp)>=0) - LOG(4,"Infinity found for log likelihood ratio, possibly because data contains fully correlated/anticorrelated columns. This may affect downstream analyses.") - ret=pij_gassist_llrtopij5_tot(g,tp,vp,nv,&nratio); - if(ret) - ERRRET("Failed to calculate probabilities in step 5.") - if(nodiag) - VECTORFF(wrap_nodiag)(vp,p5); - else - VECTORFF(wrap)(vp,p5); - - //Free memory - CLEANUP - return 0; -#undef CLEANUP -} - - - - - - - - - - - - - - - - diff --git a/pij/gassist/llrtopij_tot.h b/pij/gassist/llrtopij_tot.h deleted file mode 100644 index e87c811..0000000 --- a/pij/gassist/llrtopij_tot.h +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -/* This file contains the conversion from log likelihood ratio to probabilities - */ - -#ifndef _HEADER_LIB_PIJ_GASSIST_LLRTOPIJ_TOT_H_ -#define _HEADER_LIB_PIJ_GASSIST_LLRTOPIJ_TOT_H_ -#include "../../base/config.h" -#include "../../base/gsl/histogram.h" -#include "../../base/types.h" -#include "../llrtopij_tot.h" - -#ifdef __cplusplus -extern "C" -{ -#endif - -/* The recommended wrapper conversion methods of step 1-5 data. Uses analytical generation of pdfs of null distribution. - * g: (ng,ns) Genotype data. - * llr: Log likelihood ratios of real data - * p: Output matrix of converted probabilities - * nv: number of possible genotype values for each SNP - * nratio: if not NULL, return the ratio of null distribution in real data. - * Return: 0 on success - */ -int pij_gassist_llrtopij1_tot(const MATRIXG* g,const VECTORF* llr,VECTORF* p,size_t nv,double* nratio); -int pij_gassist_llrtopij2_tot(const MATRIXG* g,const VECTORF* llr,VECTORF* p,size_t nv,double* nratio); -int pij_gassist_llrtopij3_tot(const MATRIXG* g,const VECTORF* llr,VECTORF* p,size_t nv,double* nratio); -int pij_gassist_llrtopij4_tot(const MATRIXG* g,const VECTORF* llr,VECTORF* p,size_t nv,double* nratio); -int pij_gassist_llrtopij5_tot(const MATRIXG* g,const VECTORF* llr,VECTORF* p,size_t nv,double* nratio); - - -/* Calculates null log likelihood ratios from permuted data, and then convert log likelihood ratios of real data - * into probabilities. For step 3, permute A and B. - * g: (ng,ns) Full genotype data - * t: (ng,ns) Full transcript data of A. Each rows best eQTL must be the same row of g - * t2: (nt,ns) Full transcript data of B. - * llr1: (ng) Log likelihood ratios of real data for step 1 - * llr2: (ng,nt) Log likelihood ratios of real data for step 2 - * llr3: (ng,nt) Log likelihood ratios of real data for step 3 - * llr4: (ng,nt) Log likelihood ratios of real data for step 4 - * llr5: (ng,nt) Log likelihood ratios of real data for step 5 - * p1: (ng) Output for converted probabilities of step 1 - * p2: (ng,nt) Output for converted probabilities of step 2 - * p3: (ng,nt) Output for converted probabilities of step 3 - * p4: (ng,nt) Output for converted probabilities of step 4 - * p5: (ng,nt) Output for converted probabilities of step 5 - * ans: (ng,nt) Output matrix of converted probabilities - * nv: number of possible genotype values for each SNP - * nodiag: When the top ng rows of t2 is exactly t, diagonals of p2 and p3 are meaningless. - * In this case, set nodiag to 1 to avoid inclusion of NANs. For nodiag=0, t and t2 - * should not have any identical genes. - * Return: 0 on success - */ -int pij_gassist_llrtopijs_tot(const MATRIXG* g,const MATRIXF* t,const MATRIXF* t2,const VECTORF* llr1,const MATRIXF* llr2,const MATRIXF* llr3,const MATRIXF* llr4,const MATRIXF* llr5,VECTORF* p1,MATRIXF* p2,MATRIXF* p3,MATRIXF* p4,MATRIXF* p5,size_t nv,char nodiag); - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#ifdef __cplusplus -} -#endif -#endif diff --git a/pij/gassist/llrtopv.c b/pij/gassist/llrtopv.c index 29ef7f3..ad3b708 100644 --- a/pij/gassist/llrtopv.c +++ b/pij/gassist/llrtopv.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -34,7 +34,7 @@ * n1c, * n1d, * n2c, - * n2d: Parameters to specify null distribution. See pij_llrtopij_a_nullhist + * n2d: Parameters to specify null distribution. See pij_nullhist * Return: 0 if success. */ static int pij_gassist_llrtopv_block(MATRIXF* d,const MATRIXG* g,size_t nv,long n1c,size_t n1d,long n2c,size_t n2d) @@ -81,7 +81,7 @@ static int pij_gassist_llrtopv_block(MATRIXF* d,const MATRIXG* g,size_t nv,long * n1c, * n1d, * n2c, - * n2d: Parameters to specify null distribution. See pij_llrtopij_a_nullhist + * n2d: Parameters to specify null distribution. See pij_nullhist * Return: 0 if success. */ static inline int pij_gassist_llrtopv_vec_block(VECTORF* d,const MATRIXG* g,size_t nv,long n1c,size_t n1d,long n2c,size_t n2d) diff --git a/pij/gassist/llrtopv.h b/pij/gassist/llrtopv.h index 2a650b3..bff394e 100644 --- a/pij/gassist/llrtopv.h +++ b/pij/gassist/llrtopv.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/pij/gassist/nulldist.c b/pij/gassist/nulldist.c deleted file mode 100644 index e5e0ac1..0000000 --- a/pij/gassist/nulldist.c +++ /dev/null @@ -1,310 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -#include "../../base/config.h" -#include -#include -#include -#include -#include -#include "../../base/gsl/blas.h" -#include "../../base/gsl/math.h" -#include "../../base/gsl/sf.h" -#include "../../base/logger.h" -#include "../../base/macros.h" -#include "../../base/histogram.h" -#include "../../base/math.h" -#include "../../base/data_process.h" -#include "../nulldist.h" -#include "nulldist.h" -#pragma GCC diagnostic ignored "-Wunused-parameter" - - - -/************************************************************* - * Generic functions for any step - * - * Mixture distributions of LLR for any step at specific locations. - * The distribution is a mixture one because different genotype locations - * can have different number - * of values observed, yielding to different analytical distributions of LLR. - *************************************************************/ - -/* Calculate log total mixture pdf of LLR for any step at specific locations with buffer provided. - * The distribution is a mixture one because different genotype locations can have different number - * of values observed, yielding to different analytical distributions of LLR. - * Specify function for different steps. - * g: (ng,ns) Genotype data - * nv: Number of values each genotype can take - * loc: (nd) Locations of LLR of current step, where log pdf will be calculated - * ans: (nd) Output for log pdf calculated. - * vb2: (nd) Buffer - * vb3: (ng) Buffer - * vb4: (nv+1) Buffer - * mb1: (nv-1,nd) Saves pdf before summing up w.r.t nv. - * nd: loc->size - * func:Function to calculate pdf buffed (e.g. pij_nulldist1_calcpdf_buffed or pij_nulldist3_calcpdf_buffed) - */ -static void pij_gassist_nulldist_mixed_pdf_buffed(const MATRIXG* g,size_t nv,const VECTORD* loc,VECTORD* ans,VECTORD* vb2,VECTORG* vb3,VECTORD* vb4,VECTORUC* vb5,MATRIXD* mb1,void (*func)(size_t,size_t,const VECTORD*,MATRIXD*,VECTORD*)) -{ - size_t ns=g->size2; - - assert(loc->size&&(ans->size==loc->size)&&(vb2->size==loc->size)&&(mb1->size2==loc->size)); - assert((nv>1)&&(mb1->size1==(nv-1))); - - //First calculate separate log pdfs - func(ns,nv,loc,mb1,vb2); - - //Count alleles for each genotype - MATRIXGF(countv_byrow_buffed)(g,vb3,vb5); - VECTORGF(count_ratio_d)(vb3,vb4); - - //Calculate averaged pdf w.r.t genotype - if(VECTORDF(get)(vb4,1)!=0) - LOG(1,"Found genotype that has single value across all samples. Ignored.") - - VECTORDF(const_view) vvc=VECTORDF(const_subvector)(vb4,2,(size_t)nv-1); - gsl_blas_dgemv(CblasTrans,1/(1-VECTORDF(get)(vb4,1)),mb1,&vvc.vector,0,ans); -} - -/* Calculate log pdf of LLR for any step at specific locations. - * g: (ng,ns) Genotype data - * nv: Number of values each genotype can take - * loc: (nd) Locations of LLR of current step, where log pdf will be calculated - * ans: (nd) Output for log pdf calculated. - * nd: loc->size - * func:Function to calculate pdf buffed (e.g. pij_nulldist1_calcpdf_buffed or pij_nulldist3_calcpdf_buffed) - * Return: 0 on success. - */ -static int pij_gassist_nulldist_mixed_pdf(const MATRIXG* g,size_t nv,const VECTORD* loc,VECTORD* ans,void (*func)(size_t,size_t,const VECTORD*,MATRIXD*,VECTORD*)) -{ -#define CLEANUP AUTOFREEVEC(vb2)AUTOFREEVEC(vb4)AUTOFREEVEC(vb4) - size_t ng=g->size1,nd=loc->size; - - VECTORG *vb3=VECTORGF(alloc)(ng); - MATRIXD *mb1=MATRIXDF(alloc)(nv-1,nd); - AUTOALLOCVECD(vb2,nd,10000) - AUTOALLOCVECD(vb4,nv+1,1000) - AUTOALLOCVECUC(vb5,nv,1000) - - if(!(vb2&&vb3&&vb4&&vb5&&mb1)) - ERRRET("Not enough memory.") - - pij_gassist_nulldist_mixed_pdf_buffed(g,nv,loc,ans,vb2,vb3,vb4,vb5,mb1,func); - CLEANUP - return 0; -#undef CLEANUP -} - -/* Calculates density histogram of null distribution of log likelihood ratio. - * Is a pij_allrtopij_nullhist_method (see pij_allrtopij.h). - * Method is to use central pdf value as the density. - * g: (ng,ns) Genotype data - * nv: Number of values each genotype can take - * range: (nbin+1) Bin boundary values - * nbin: Number of bins - * hist: (nbin) Output array for density histogram. - * param: Redundant, must be 0. - * func: Function to calculate pdf buffed (e.g. pij_nulldist1_calcpdf_buffed or pij_nulldist3_calcpdf_buffed) - * Return: 0 on success. - */ -static int pij_gassist_nulldist_nullhist_mixed_pdf0(const MATRIXG* g,size_t nv,const double* restrict range,size_t nbin,double* restrict hist,void* param,void (*func)(size_t,size_t,const VECTORD*,MATRIXD*,VECTORD*)) -{ -#define CLEANUP AUTOFREEVEC(vloc) - int ret; - VECTORDF(view) vvh=VECTORDF(view_array)(hist,nbin); - AUTOALLOCVECD(vloc,nbin,1000) - - assert(!param); - if(!vloc) - ERRRET("Not enough memory.") - //Obtain bin central values - { - VECTORDF(const_view) vv1=VECTORDF(const_view_array)(range+1,nbin); - VECTORDF(memcpy)(vloc,&vv1.vector); - } - { - VECTORDF(const_view) vv1=VECTORDF(const_view_array)(range,nbin); - VECTORDF(add)(vloc,&vv1.vector); - } - VECTORDF(scale)(vloc,0.5); - ret=pij_gassist_nulldist_mixed_pdf(g,nv,vloc,&vvh.vector,func); - CLEANUP - return ret; -#undef CLEANUP -} - -/* Calculates density histogram of null distribution of log likelihood ratio. - * Method is to use central pdf value as the density. - * g: (ng,ns) Genotype data - * nv: Number of values each genotype can take - * range: (nbin+1) Bin boundary values - * nbin: Number of bins - * hist: (nbin) Output array for density histogram. - * param: Redundant, must be 0. - * func: Function to calculate pdf buffed (e.g. pij_nulldist1_calcpdf_buffed or pij_nulldist3_calcpdf_buffed) - * Return: 0 on success. - */ -static int pij_gassist_nulldist_nullhist_mixed_pdf(const MATRIXG* g,size_t nv,const double* restrict range,size_t nbin,double* restrict hist,void* param,void (*func)(size_t,size_t,const VECTORD*,MATRIXD*,VECTORD*)) -{ -#define CLEANUP CLEANVECD(loc)CLEANVECD(val) - VECTORD *loc,*val; - VECTORDF(view) vvh=VECTORDF(view_array)(hist,nbin); - size_t *n=param; - size_t i; - size_t nsp; - - assert(n&&(*n<10)); - if(!*n) - return pij_gassist_nulldist_nullhist_mixed_pdf0(g,nv,range,nbin,hist,param,func); - nsp=(size_t)1<<(*n-1); - loc=VECTORDF(alloc)(nbin*nsp); - val=VECTORDF(alloc)(nbin*nsp); - if(!(loc&&val)) - ERRRET("Not enough memory.") - - //Construct bin ranges - { - VECTORDF(const_view) vvc=VECTORDF(const_view_array)(range,nbin+1); - histogram_finer_central(&vvc.vector,loc,nsp); - } - - //Calculate bin values - if(pij_gassist_nulldist_mixed_pdf(g,nv,loc,val,func)) - { - CLEANUP - return 1; - } - - //Shrink to output - VECTORDF(set_zero)(&vvh.vector); - for(i=0;ig,p->nv,loc,ans,pij_gassist_nulldist1_calcpdf_buffed); -} - -int pij_gassist_nulldist2_mixed_pdf(const VECTORD* loc,VECTORD* ans,const void* param) -{ - const struct pij_gassist_nulldist_mixed_pdf_data *p=param; - return pij_gassist_nulldist_mixed_pdf(p->g,p->nv,loc,ans,pij_gassist_nulldist2_calcpdf_buffed); -} - -int pij_gassist_nulldist3_mixed_pdf(const VECTORD* loc,VECTORD* ans,const void* param) -{ - const struct pij_gassist_nulldist_mixed_pdf_data *p=param; - return pij_gassist_nulldist_mixed_pdf(p->g,p->nv,loc,ans,pij_gassist_nulldist3_calcpdf_buffed); -} - -int pij_gassist_nulldist4_mixed_pdf(const VECTORD* loc,VECTORD* ans,const void* param) -{ - const struct pij_gassist_nulldist_mixed_pdf_data *p=param; - return pij_gassist_nulldist_mixed_pdf(p->g,p->nv,loc,ans,pij_gassist_nulldist4_calcpdf_buffed); -} - -int pij_gassist_nulldist5_mixed_pdf(const VECTORD* loc,VECTORD* ans,const void* param) -{ - const struct pij_gassist_nulldist_mixed_pdf_data *p=param; - return pij_gassist_nulldist_mixed_pdf(p->g,p->nv,loc,ans,pij_gassist_nulldist5_calcpdf_buffed); -} - -int pij_gassist_nulldist_nullhist1_pdf(const MATRIXG* g,size_t nv,const double* restrict range,size_t nbin,double* restrict hist,void* param) -{ - return pij_gassist_nulldist_nullhist_mixed_pdf(g,nv,range,nbin,hist,param,pij_gassist_nulldist1_calcpdf_buffed); -} - -int pij_gassist_nulldist_nullhist2_mixed_pdf(const MATRIXG* g,size_t nv,const double* restrict range,size_t nbin,double* restrict hist,void* param) -{ - return pij_gassist_nulldist_nullhist_mixed_pdf(g,nv,range,nbin,hist,param,pij_gassist_nulldist2_calcpdf_buffed); -} - -int pij_gassist_nulldist_nullhist3_mixed_pdf(const MATRIXG* g,size_t nv,const double* restrict range,size_t nbin,double* restrict hist,void* param) -{ - return pij_gassist_nulldist_nullhist_mixed_pdf(g,nv,range,nbin,hist,param,pij_gassist_nulldist3_calcpdf_buffed); -} - -int pij_gassist_nulldist_nullhist4_mixed_pdf(const MATRIXG* g,size_t nv,const double* restrict range,size_t nbin,double* restrict hist,void* param) -{ - return pij_gassist_nulldist_nullhist_mixed_pdf(g,nv,range,nbin,hist,param,pij_gassist_nulldist4_calcpdf_buffed); -} - -int pij_gassist_nulldist_nullhist5_mixed_pdf(const MATRIXG* g,size_t nv,const double* restrict range,size_t nbin,double* restrict hist,void* param) -{ - return pij_gassist_nulldist_nullhist_mixed_pdf(g,nv,range,nbin,hist,param,pij_gassist_nulldist5_calcpdf_buffed); -} - - - - - - - - - - - - - - - - - - - - diff --git a/pij/gassist/nulldist.h b/pij/gassist/nulldist.h deleted file mode 100644 index 22b8529..0000000 --- a/pij/gassist/nulldist.h +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -/* This part contains analytical calculation of the distribution of log likelihood ratio from null hypothesis. - * Each function is applicable to one or more stages, which are stated in the function name as pij_nulldistX_..., where X is the applicable stage. - * For each stage, different methods to calculate histogram can coexist. The method is declared in the function name as suffix: - * _cdf: Calculate histogram as the difference of cdf. - * This is applicable when distribution is single-variable integrable. - * _pdf: Calculate histogram as the pdf mean of points evenly split within the bin. This is applicable when distribution is single-variable non-integrable. - * _sim: Construct histogram by sampling. This is applicable when distribution is multi-variable non-integrable. - */ - -#ifndef _HEADER_LIB_PIJ_GASSIST_NULLDIST_H_ -#define _HEADER_LIB_PIJ_GASSIST_NULLDIST_H_ -#include "../../base/config.h" -#include "../../base/types.h" -#ifdef __cplusplus -extern "C" -{ -#endif - - -/************************************************************* - * Specific functions for each step - *************************************************************/ - -/* Calculate log pdf of LLR for each step at specific locations with buffer provided. - * Uses pij_nulldist_calcpdf_buffed. - * ns: Number of samples - * nv: Number of values each genotype can take. Calculates for value counts=2,...,nv. - * loc: (nd) Locations of LLR of each step, where log pdf will be calculated - * ans: (nv-1,nd) Output matrix. ans[i,j]=p(loc[j],ns,i+2). - * vb2: (nd) Buffer. - */ -void pij_gassist_nulldist1_calcpdf_buffed(size_t ns,size_t nv,const VECTORD* loc,MATRIXD* ans,VECTORD* vb2); -void pij_gassist_nulldist2_calcpdf_buffed(size_t ns,size_t nv,const VECTORD* loc,MATRIXD* ans,VECTORD* vb2); -void pij_gassist_nulldist3_calcpdf_buffed(size_t ns,size_t nv,const VECTORD* loc,MATRIXD* ans,VECTORD* vb2); -void pij_gassist_nulldist4_calcpdf_buffed(size_t ns,size_t nv,const VECTORD* loc,MATRIXD* ans,VECTORD* vb2); -void pij_gassist_nulldist5_calcpdf_buffed(size_t ns,size_t nv,const VECTORD* loc,MATRIXD* ans,VECTORD* vb2); - - -struct pij_gassist_nulldist_mixed_pdf_data -{ - const MATRIXG* g; - size_t nv; -}; - -/* Calculate log pdf of LLR for each step at specific locations using pij_nulldist_mixed_pdf. - * Steps 1 and 2_conserv are identical. - * g: (ng,ns) Genotype data - * nv: Number of values each genotype can take - * loc: (nd) Locations of LLR of current step, where log pdf will be calculated - * ans: (nd) Output for log pdf calculated. - * Return: 0 on success. - */ -int pij_gassist_nulldist1_mixed_pdf(const VECTORD* loc,VECTORD* ans,const void* param); -int pij_gassist_nulldist2_mixed_pdf(const VECTORD* loc,VECTORD* ans,const void* param); -int pij_gassist_nulldist3_mixed_pdf(const VECTORD* loc,VECTORD* ans,const void* param); -int pij_gassist_nulldist4_mixed_pdf(const VECTORD* loc,VECTORD* ans,const void* param); -int pij_gassist_nulldist5_mixed_pdf(const VECTORD* loc,VECTORD* ans,const void* param); - -// #define pij_nulldist2_nullhist_pdf0 pij_nulldist1_nullhist_pdf0 -/* Calculates density histogram of null distribution of log likelihood ratio. - * Steps 1 and 2_conserv are identical. - * Uses pij_nulldist_nullhist_pdf. - * Method is to use central pdf value as the density. - * g: (ng,ns) Genotype data - * nv: Number of values each genotype can take - * range: (nbin+1) Bin boundary values - * nbin: Number of bins - * hist: (nbin) Output array for density histogram. - * param: Redundant, must be 0. - * Return: 0 on success. - */ -int pij_gassist_nulldist_nullhist1_mixed_pdf(const MATRIXG* g,size_t nv,const double* restrict range,size_t nbin,double* restrict hist,void* param); -int pij_gassist_nulldist_nullhist2_mixed_pdf(const MATRIXG* g,size_t nv,const double* restrict range,size_t nbin,double* restrict hist,void* param); -int pij_gassist_nulldist_nullhist3_mixed_pdf(const MATRIXG* g,size_t nv,const double* restrict range,size_t nbin,double* restrict hist,void* param); -int pij_gassist_nulldist_nullhist4_mixed_pdf(const MATRIXG* g,size_t nv,const double* restrict range,size_t nbin,double* restrict hist,void* param); -int pij_gassist_nulldist_nullhist5_mixed_pdf(const MATRIXG* g,size_t nv,const double* restrict range,size_t nbin,double* restrict hist,void* param); - - - - - - - - - - - - - - - - - - -#ifdef __cplusplus -} -#endif -#endif diff --git a/pij/gassist/nullhist.c b/pij/gassist/nullhist.c index 3c9e326..280b4df 100644 --- a/pij/gassist/nullhist.c +++ b/pij/gassist/nullhist.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -19,62 +19,25 @@ #include #include #include -#include -#include "../../base/gsl/blas.h" #include "../../base/logger.h" -#include "../../base/macros.h" -#include "../../base/histogram.h" +#include "../../base/gsl/histogram.h" #include "../nullhist.h" #include "nullhist.h" -#include "nulldist.h" - -int pij_gassist_nullhist_analytical1_pdf(const void* param,gsl_histogram* h) -{ - const struct pij_gassist_nullhist_analytical_pdf_param *p=param; - const struct pij_gassist_nulldist_mixed_pdf_data p2={p->g,p->nv}; - const struct pij_nullhist_analytical_pdf_param p3={p->nsplit,pij_gassist_nulldist1_mixed_pdf,&p2}; - - return pij_nullhist_analytical_pdf(&p3,h); -} - -int pij_gassist_nullhist_analytical2_pdf(const void* param,gsl_histogram* h) -{ - const struct pij_gassist_nullhist_analytical_pdf_param *p=param; - const struct pij_gassist_nulldist_mixed_pdf_data p2={p->g,p->nv}; - const struct pij_nullhist_analytical_pdf_param p3={p->nsplit,pij_gassist_nulldist2_mixed_pdf,&p2}; - - return pij_nullhist_analytical_pdf(&p3,h); -} - -int pij_gassist_nullhist_analytical3_pdf(const void* param,gsl_histogram* h) +int pij_gassist_nullhists(gsl_histogram** h[4],size_t nt,size_t ns,size_t nv,const FTYPE dmax[4]) { - const struct pij_gassist_nullhist_analytical_pdf_param *p=param; - const struct pij_gassist_nulldist_mixed_pdf_data p2={p->g,p->nv}; - const struct pij_nullhist_analytical_pdf_param p3={p->nsplit,pij_gassist_nulldist3_mixed_pdf,&p2}; - - return pij_nullhist_analytical_pdf(&p3,h); + //Construct null density histograms + h[0]=pij_nullhist((double)dmax[0],nv,nt,1,1,1,ns-2); + h[1]=pij_nullhist((double)dmax[1],nv,nt,1,1,1,ns-3); + h[2]=pij_nullhist((double)dmax[2],nv,nt,1,2,1,ns-3); + h[3]=pij_nullhist((double)dmax[3],nv,nt,0,1,1,ns-3); + if(h[0]&&h[1]&&h[2]&&h[3]) + return 0; + + LOG(1,"pij_nullhist failed.") + return 1; } -int pij_gassist_nullhist_analytical4_pdf(const void* param,gsl_histogram* h) -{ - const struct pij_gassist_nullhist_analytical_pdf_param *p=param; - const struct pij_gassist_nulldist_mixed_pdf_data p2={p->g,p->nv}; - const struct pij_nullhist_analytical_pdf_param p3={p->nsplit,pij_gassist_nulldist4_mixed_pdf,&p2}; - - return pij_nullhist_analytical_pdf(&p3,h); -} - -int pij_gassist_nullhist_analytical5_pdf(const void* param,gsl_histogram* h) -{ - const struct pij_gassist_nullhist_analytical_pdf_param *p=param; - const struct pij_gassist_nulldist_mixed_pdf_data p2={p->g,p->nv}; - const struct pij_nullhist_analytical_pdf_param p3={p->nsplit,pij_gassist_nulldist5_mixed_pdf,&p2}; - - return pij_nullhist_analytical_pdf(&p3,h); -} - - diff --git a/pij/gassist/nullhist.h b/pij/gassist/nullhist.h index 05fde9d..d9cc65a 100644 --- a/pij/gassist/nullhist.h +++ b/pij/gassist/nullhist.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -29,26 +29,17 @@ extern "C" { #endif -struct pij_gassist_nullhist_analytical_pdf_param -{ - //(ng,ns) Genotype data - const MATRIXG* g; - //Maximum number of possible values each genotype can take. - //=Number of alleles + 1 - size_t nv; - /* Number of split within each bin. 2^nsplit central points are - * taken and the mean is calculated as the average pdf within the bin. - */ - size_t nsplit; -}; - - -//Specific interface function for different stages -int pij_gassist_nullhist_analytical1_pdf(const void* param,gsl_histogram* h); -int pij_gassist_nullhist_analytical2_pdf(const void* param,gsl_histogram* h); -int pij_gassist_nullhist_analytical3_pdf(const void* param,gsl_histogram* h); -int pij_gassist_nullhist_analytical4_pdf(const void* param,gsl_histogram* h); -int pij_gassist_nullhist_analytical5_pdf(const void* param,gsl_histogram* h); +/* Produce null histograms for all tests (2 to 5). + * h: Output location of null histograms. 0 to 3 for tests 2 to 5. + * nt: Number of targets + * ns: Number of samples + * nv: Number of values, = number of alleles + 1 + * dmax: Maximum value of all LLRs, for histogram construction. + * It can be larger than the maximum of d, if memlimit is not infinite. + * 0 to 4 for tests 1 to 5. + * Return: 0 on success and 1 otherwise + */ +int pij_gassist_nullhists(gsl_histogram** h[4],size_t nt,size_t ns,size_t nv,const FTYPE dmax[4]); diff --git a/pij/llrtopij.c b/pij/llrtopij.c index 1a21fa0..d3e6962 100644 --- a/pij/llrtopij.c +++ b/pij/llrtopij.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -28,11 +28,11 @@ #include "../base/macros.h" #include "../base/data_process.h" #include "../base/histogram.h" -#include "nullmodeler.h" -#include "nullsampler.h" +#include "../base/threading.h" #include "nullhist.h" #include "llrtopij.h" -#pragma GCC diagnostic ignored "-Wunused-parameter" + +// #pragma GCC diagnostic ignored "-Wunused-parameter" /* Smoothen true ratio histogram to make it monotonically increasing * Method: 1. Construct increasing upper bound histogram (hup) @@ -289,6 +289,289 @@ int pij_llrtopij_convert_histograms(gsl_histogram* hreal,VECTORD* vnull,gsl_hist #undef CLEANUP } +FTYPE pij_llrtopij_llrmatmax(MATRIXF* d,char nodiag) +{ + FTYPE dmin,dmax; + if(nodiag) + MATRIXFF(minmax_nodiag)(d,&dmin,&dmax,0); + else + MATRIXFF(minmax)(d,&dmin,&dmax); + if((!(dmin>=0))||gsl_isnan(dmax)) + { + LOG(1,"Negative or NAN found in input data. It may invalidate follow up analysis. This may be due to incorrect previous steps.") + return 0; + } + if(gsl_isinf(dmax)) + { + LOG(5,"INF found in input data. It may invalidate follow up analysis. This may be due to incorrect previous steps or duplicate rows. Now regard INFs as largest non-INF value.") + MATRIXFF(set_inf)(d,-1); + if(nodiag) + MATRIXFF(minmax_nodiag)(d,&dmin,&dmax,0); + else + dmax=MATRIXFF(max)(d); + MATRIXFF(set_value)(d,-1,dmax); + } + return dmax; +} + +FTYPE pij_llrtopij_llrvecmax(VECTORF* d) +{ + FTYPE dmin,dmax; + VECTORFF(minmax)(d,&dmin,&dmax); + if((!(dmin>=0))||gsl_isnan(dmax)) + { + LOG(1,"Negative or NAN found in input data. It may invalidate follow up analysis. This may be due to incorrect previous steps.") + return 0; + } + if(gsl_isinf(dmax)) + { + LOG(5,"INF found in input data. It may invalidate follow up analysis. This may be due to incorrect previous steps or duplicate rows. Now regard INFs as largest non-INF value.") + VECTORFF(set_inf)(d,-1); + dmax=VECTORFF(max)(d); + VECTORFF(set_value)(d,-1,dmax); + } + return dmax; +} + + +int pij_llrtopij_convert_single(const MATRIXF* d,const MATRIXF* dconv,MATRIXF* ans,size_t n1,size_t n2,char nodiag,long nodiagshift) +{ +#define CLEANUP CLEANHIST(hreal)CLEANHIST(hc)\ + CLEANHIST(h)CLEANVECD(vb1)CLEANVECD(vb2) + size_t ng=d->size1; + long k; + size_t j,nbin; + gsl_histogram *h,*hreal,*hc; + VECTORD *vb1,*vb2; + + h=0; + hreal=hc=0; + vb1=0; + vb2=0; + //Validity checks + assert((dconv->size1==ng)&&(ans->size1==ng)&&(ans->size2==dconv->size2)); + + //Construct null density histograms + { + FTYPE dmin,dmax; + MATRIXFF(minmax)(d,&dmin,&dmax); + if((!(dmin>=0))||gsl_isnan(dmax)||gsl_isinf(dmax)) + ERRRET("Negative or NAN found in input data. It may invalidate follow up analysis. This may be due to incorrect previous steps.") + h=pij_nullhist_single((double)dmax,d->size2,n1,n2); + if(!h) + ERRRET("pij_nullhist_single failed.") + nbin=h->n; + } + + //Prepare for real histogram + hreal=gsl_histogram_clone(h); + hc=gsl_histogram_alloc(hreal->n+2); + if(!(hreal&&hc)) + ERRRET("Not enough memory."); + if(pij_llrtopij_convert_histograms_make_buffs(nbin,&vb1,&vb2)) + ERRRET("pij_llrtopij_convert_histograms_make_buffs failed.") + + //Conversion + { + VECTORDF(view) vv1,vvreal; + VECTORFF(view) vva; + VECTORD *vwidth,*vnull; + vwidth=VECTORDF(alloc)(nbin); + vnull=VECTORDF(alloc)(nbin); + if(!(vwidth&&vnull)) + { + CLEANVECD(vwidth)CLEANVECD(vnull) + ERRRET("Not enough memory.") + } + vv1=VECTORDF(view_array)(h->range+1,nbin); + VECTORDF(memcpy)(vwidth,&vv1.vector); + vv1=VECTORDF(view_array)(h->range,nbin); + VECTORDF(sub)(vwidth,&vv1.vector); + vvreal=VECTORDF(view_array)(hreal->bin,nbin); + vv1=VECTORDF(view_array)(h->bin,nbin); + for(j=0;jrange,h->range,(nbin+1)*sizeof(*hreal->range)); + memset(hreal->bin,0,nbin*sizeof(*hreal->bin)); + //Construct real histogram + if(nodiag&&((long)j+nodiagshift>=0)&&((long)j+nodiagshift<(long)d->size2)) + { + for(k=(long)j+nodiagshift-1;k>=0;k--) + gsl_histogram_increment(hreal,MATRIXFF(get)(d,j,(size_t)k)); + for(k=(long)j+nodiagshift+1;k<(long)d->size2;k++) + gsl_histogram_increment(hreal,MATRIXFF(get)(d,j,(size_t)k)); + VECTORDF(scale)(&vvreal.vector,1./(double)(d->size2-1)); + } + else + { + for(k=0;k<(long)d->size2;k++) + gsl_histogram_increment(hreal,MATRIXFF(get)(d,j,(size_t)k)); + VECTORDF(scale)(&vvreal.vector,1./(double)(d->size2)); + } + //Convert to density histogram + VECTORDF(div)(&vvreal.vector,vwidth); + //Convert to probability central histogram + pij_llrtopij_convert_histograms_buffed(hreal,vnull,hc,vb1,vb2); + //Convert likelihoods to probabilities + vva=MATRIXFF(row)(ans,j); + pij_llrtopij_histogram_interpolate_linear(hc,&vvd.vector,&vva.vector); + } + CLEANVECD(vwidth)CLEANVECD(vnull) + } + CLEANUP + return 0; +#undef CLEANUP +} + +int pij_llrtopij_convert_single_self(MATRIXF* d,size_t n1,size_t n2,char nodiag,long nodiagshift) +{ +#define CLEANUP CLEANAMHIST(hreal,nth)CLEANAMHIST(hc,nth)\ + CLEANHIST(h)CLEANMATD(mb1)CLEANMATD(mb2)CLEANMATD(mnull)CLEANMATF(mb3)CLEANVECD(vwidth) + + size_t ng=d->size1; + size_t i,nbin; + gsl_histogram *h; + MATRIXD *mb1,*mb2,*mnull; + MATRIXF *mb3; + VECTORD *vwidth; + VECTORDF(view) vv1; + size_t nth; + + h=0; + mb1=mb2=mnull=0; + mb3=0; + vwidth=0; + //Validity checks + { + int nth0=omp_get_max_threads(); + assert(nth0>0); + nth=(size_t)nth0; + } + + + AUTOCALLOC(gsl_histogram*,hreal,nth,64) + AUTOCALLOC(gsl_histogram*,hc,nth,64) + if(!(hreal&&hc)) + ERRRET("Not enough memory."); + + //Construct null density histograms + { + FTYPE dmin,dmax; + if(nodiag) + MATRIXFF(minmax_nodiag)(d,&dmin,&dmax,nodiagshift); + else + MATRIXFF(minmax)(d,&dmin,&dmax); + if((!(dmin>=0))||gsl_isnan(dmax)) + ERRRET("Negative or NAN found in input data. It may invalidate follow up analysis. This may be due to incorrect previous steps.") + if(gsl_isinf(dmax)) + { + LOG(5,"INF found in input data. It may invalidate follow up analysis. This may be due to incorrect previous steps or duplicate rows (by Spearman correlation).") + MATRIXFF(set_inf)(d,-1); + if(nodiag) + MATRIXFF(minmax_nodiag)(d,&dmin,&dmax,nodiagshift); + else + MATRIXFF(minmax)(d,&dmin,&dmax); + MATRIXFF(set_value)(d,-1,dmax); + } + h=pij_nullhist_single((double)dmax,d->size2,n1,n2); + if(!h) + ERRRET("pij_nullhist_single failed.") + nbin=h->n; + } + //Memory allocation + { + size_t n1,n2; + pij_llrtopij_convert_histograms_get_buff_sizes(nbin,&n1,&n2); + mb1=MATRIXDF(alloc)(nth,n1); + mb2=MATRIXDF(alloc)(nth,n2); + mnull=MATRIXDF(alloc)(nth,nbin); + mb3=MATRIXFF(alloc)(nth,d->size2); + vwidth=VECTORDF(alloc)(nbin); + if(!(mb1&&mb2&&mnull&&mb3&&vwidth)) + ERRRET("Not enough memory.") + } + + //Prepare for real histogram + { + int ret; + for(i=0,ret=1;in+2); + ret=ret&&hreal[i]&&hc[i]; + } + if(!ret) + ERRRET("Not enough memory."); + } + + //Conversion + vv1=VECTORDF(view_array)(h->range+1,nbin); + VECTORDF(memcpy)(vwidth,&vv1.vector); + vv1=VECTORDF(view_array)(h->range,nbin); + VECTORDF(sub)(vwidth,&vv1.vector); + vv1=VECTORDF(view_array)(h->bin,nbin); + #pragma omp parallel + { + size_t ng1,ng2,id; + size_t j; + long k; + VECTORDF(view) vvreal,vvnull,vvb1,vvb2; + VECTORFF(view) vvb3,vva; + + id=(size_t)omp_get_thread_num(); + vvreal=VECTORDF(view_array)(hreal[id]->bin,nbin); + vvnull=MATRIXDF(row)(mnull,id); + vvb1=MATRIXDF(row)(mb1,id); + vvb2=MATRIXDF(row)(mb2,id); + vvb3=MATRIXFF(row)(mb3,id); + + threading_get_startend(ng,&ng1,&ng2); + + for(j=ng1;jrange,h->range,(nbin+1)*sizeof(*hreal[id]->range)); + memset(hreal[id]->bin,0,nbin*sizeof(*hreal[id]->bin)); + //Construct real histogram + if(nodiag&&((long)j+nodiagshift>=0)&&((long)j+nodiagshift<(long)d->size2)) + { + for(k=(long)j+nodiagshift-1;k>=0;k--) + gsl_histogram_increment(hreal[id],MATRIXFF(get)(d,j,(size_t)k)); + for(k=(long)j+nodiagshift+1;k<(long)d->size2;k++) + gsl_histogram_increment(hreal[id],MATRIXFF(get)(d,j,(size_t)k)); + VECTORDF(scale)(&vvreal.vector,1./(double)(d->size2-1)); + } + else + { + for(k=0;k<(long)d->size2;k++) + gsl_histogram_increment(hreal[id],MATRIXFF(get)(d,j,(size_t)k)); + VECTORDF(scale)(&vvreal.vector,1./(double)(d->size2)); + } + + //Convert to density histogram + VECTORDF(div)(&vvreal.vector,vwidth); + //Convert to probability central histogram + pij_llrtopij_convert_histograms_buffed(hreal[id],&vvnull.vector,hc[id],&vvb1.vector,&vvb2.vector); + //Convert likelihoods to probabilities + vva=MATRIXFF(row)(d,j); + pij_llrtopij_histogram_interpolate_linear(hc[id],&vvb3.vector,&vva.vector); + } + } + + CLEANUP + return 0; +#undef CLEANUP +} + + + + + + + diff --git a/pij/llrtopij.h b/pij/llrtopij.h index 8b45b61..05e1755 100644 --- a/pij/llrtopij.h +++ b/pij/llrtopij.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -75,9 +75,37 @@ void pij_llrtopij_convert_histograms_buffed(gsl_histogram* hreal,VECTORD* vnull, int pij_llrtopij_convert_histograms(gsl_histogram* hreal,VECTORD* vnull,gsl_histogram* hc); +/* Obtains the maximum of matrix, possibly ignoring diagonal elements. + * Fails in the presence of NAN, and warns and updates at INFs. + * d: Matrix/Vector to get maximum, and update any INFs + * nodiag: Whether to ignore diagonal values when searching for maximum. + * Return: 0 if NAN is found, or the non-INF maximum otherwise. + */ +FTYPE pij_llrtopij_llrmatmax(MATRIXF* d,char nodiag); +FTYPE pij_llrtopij_llrvecmax(VECTORF* d); +/* Convert LLR of real data to probabilities, when the distribution + * of LLR of null distribution can be calculated analytically to follow + * x=-0.5*log(1-z1/(z1+z2)), where z1~chi2(n1),z2~chi2(n2). + * The conversion is performed for each gene A, i.e. per row of d and dconv. + * This function is older than pij_llrtopij_convert_single so it is not parallel. + * Make it parallel before using. + * d: [nrow,nx] The data to use for calculation of conversion rule from LLR to pij. + * dconv: [nrow,nd] The data of LLR to actually convert to pij. Can be same with d. + * ans: [nrow,nd] The output location of converted pij from dconv. + * n1, + * n2: Parameters of null distribution. + * nodiag: Whether diagonal elements of d should be ignored when converting + * to probabilities. + * nodiagshift: Diangonal column shift for nodiag==1. + * For nodiagshift>0/<0, use upper/lower diagonal. + * Return: 0 on success. + */ +int pij_llrtopij_convert_single(const MATRIXF* d,const MATRIXF* dconv,MATRIXF* ans,size_t n1,size_t n2,char nodiag,long nodiagshift); +// Same with pij_llrtopij_convert_single, for d=dconv=ans. Saves memory. +int pij_llrtopij_convert_single_self(MATRIXF* d,size_t n1,size_t n2,char nodiag,long nodiagshift); diff --git a/pij/llrtopij_a.c b/pij/llrtopij_a.c deleted file mode 100644 index 9a7a698..0000000 --- a/pij/llrtopij_a.c +++ /dev/null @@ -1,411 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -#include "../base/config.h" -#include -#include -#include -#include "../base/threading.h" -#include "../base/gsl/math.h" -#include "../base/gsl/histogram.h" -#include "../base/gsl/blas.h" -#include "../base/logger.h" -#include "../base/macros.h" -#include "../base/data_process.h" -#include "../base/histogram.h" -#include "nulldist.h" -#include "llrtopij_a.h" -#include "llrtopij.h" - -/* Construct one null histogram for a specific genotype value count. - * The function calculates the null density histogram for random variable: - * x=-0.5*log(1-z1/(z1+z2)), where z1~chi2(n1),z2~chi2(n2), - * Histogram bin count and width are automatically determined - * from real data count (nd). - * For bin range settings, see histogram_unequalbins_fromnullcdf. - * For null density histogram from pdf, see pij_nulldist_hist_pdf. - * dmax: Specifies the histogram bound as [0,dmax). - * nd: Count of real data to form real histograms. This is used to - * automatically decide number of bins and widths. - * n1, - * n2: Parameters of null distribution. - * Return: Constructed null distribution histograms with preset - * bin ranges and values as density. - */ -static gsl_histogram* pij_llrtopij_a_nullhist_single(double dmax,size_t nd,size_t n1,size_t n2) -{ -#define CLEANUP CLEANHIST(h) - struct pij_nulldist_pdfs_param param={n1,n2}; - size_t nbin; - gsl_histogram *h=0; - - assert(n1&&n2); - dmax*=(1+1E-6); - nbin=histogram_unequalbins_param_count(nd); - if(nbin<5) - ERRRETV(0,"Determined "PRINTFSIZET" bins constructed. Bin count too small.",nbin) - else if(nbin<10) - LOG(5,"Determined "PRINTFSIZET" bins, smaller than recommended minimum bin count (10).",nbin) - else - LOG(10,"Determined "PRINTFSIZET" bins.",nbin) - h=gsl_histogram_alloc(nbin); - if(!h) - ERRRETV(0,"Not enough memory.") - //Null density histogram - gsl_histogram_set_ranges_uniform(h,0,dmax); - //Set null histogram ranges - if(histogram_unequalbins_fromnullpdfs(nbin,h->range,pij_nulldist_pdfs,¶m)) - ERRRETV(0,"histogram_unequalbins_fromnullpdfs failed.") - //Calculate null density histogram - if(pij_nulldist_hist_pdf(h->range,nbin,h->bin,param.n1,param.n2,5)) - ERRRETV(0,"pij_nulldist_hist_pdf failed.") - return h; -#undef CLEANUP -} - -gsl_histogram** pij_llrtopij_a_nullhist(double dmax,size_t nv,size_t nd,long n1c,size_t n1d,long n2c,size_t n2d) -{ -#define CLEANUP if(h){for(i=0;i=2); - dmax*=(1+1E-6); - CALLOCSIZE(h,nv-1); - if(!h) - ERRRETV(0,"Not enough memory.") - nbin=histogram_unequalbins_param_count(nd); - if(nbin<5) - ERRRETV(0,"Determined "PRINTFSIZET" bins constructed. Bin count too small.",nbin) - else if(nbin<10) - LOG(5,"Determined "PRINTFSIZET" bins, smaller than recommended minimum bin count (10).",nbin) - else - LOG(10,"Determined "PRINTFSIZET" bins.",nbin) - ret=1; - for(i=0;irange,pij_nulldist_pdfs,¶m)) - ERRRETV(0,"histogram_unequalbins_fromnullpdfs failed.") - //Calculate null density histogram - if(pij_nulldist_hist_pdf(h[i]->range,nbin,h[i]->bin,param.n1,param.n2,5)) - ERRRETV(0,"pij_nulldist_hist_pdf failed.") - } - return h; -#undef CLEANUP -} - -int pij_llrtopij_a_convert_single(const MATRIXF* d,const MATRIXF* dconv,MATRIXF* ans,size_t n1,size_t n2,char nodiag,long nodiagshift) -{ -#define CLEANUP CLEANHIST(hreal)CLEANHIST(hc)\ - CLEANHIST(h)CLEANVECD(vb1)CLEANVECD(vb2) - size_t ng=d->size1; - long k; - size_t j,nbin; - gsl_histogram *h,*hreal,*hc; - VECTORD *vb1,*vb2; - - h=0; - hreal=hc=0; - vb1=0; - vb2=0; - //Validity checks - assert((dconv->size1==ng)&&(ans->size1==ng)&&(ans->size2==dconv->size2)); - - //Construct null density histograms - { - FTYPE dmin,dmax; - MATRIXFF(minmax)(d,&dmin,&dmax); - if((!(dmin>=0))||gsl_isnan(dmax)||gsl_isinf(dmax)) - ERRRET("Negative or NAN found in input data. It may invalidate follow up analysis. This may be due to incorrect previous steps.") - h=pij_llrtopij_a_nullhist_single((double)dmax,d->size2,n1,n2); - if(!h) - ERRRET("pij_llrtopij_a_nullhist_single failed.") - nbin=h->n; - } - - //Prepare for real histogram - hreal=gsl_histogram_clone(h); - hc=gsl_histogram_alloc(hreal->n+2); - if(!(hreal&&hc)) - ERRRET("Not enough memory."); - if(pij_llrtopij_convert_histograms_make_buffs(nbin,&vb1,&vb2)) - ERRRET("pij_llrtopij_convert_histograms_make_buffs failed.") - - //Conversion - { - VECTORDF(view) vv1,vvreal; - VECTORFF(view) vva; - VECTORD *vwidth,*vnull; - vwidth=VECTORDF(alloc)(nbin); - vnull=VECTORDF(alloc)(nbin); - if(!(vwidth&&vnull)) - { - CLEANVECD(vwidth)CLEANVECD(vnull) - ERRRET("Not enough memory.") - } - vv1=VECTORDF(view_array)(h->range+1,nbin); - VECTORDF(memcpy)(vwidth,&vv1.vector); - vv1=VECTORDF(view_array)(h->range,nbin); - VECTORDF(sub)(vwidth,&vv1.vector); - vvreal=VECTORDF(view_array)(hreal->bin,nbin); - vv1=VECTORDF(view_array)(h->bin,nbin); - for(j=0;jrange,h->range,(nbin+1)*sizeof(*hreal->range)); - memset(hreal->bin,0,nbin*sizeof(*hreal->bin)); - //Construct real histogram - if(nodiag&&((long)j+nodiagshift>=0)&&((long)j+nodiagshift<(long)d->size2)) - { - for(k=(long)j+nodiagshift-1;k>=0;k--) - gsl_histogram_increment(hreal,MATRIXFF(get)(d,j,(size_t)k)); - for(k=(long)j+nodiagshift+1;k<(long)d->size2;k++) - gsl_histogram_increment(hreal,MATRIXFF(get)(d,j,(size_t)k)); - VECTORDF(scale)(&vvreal.vector,1./(double)(d->size2-1)); - } - else - { - for(k=0;k<(long)d->size2;k++) - gsl_histogram_increment(hreal,MATRIXFF(get)(d,j,(size_t)k)); - VECTORDF(scale)(&vvreal.vector,1./(double)(d->size2)); - } - //Convert to density histogram - VECTORDF(div)(&vvreal.vector,vwidth); - //Convert to probability central histogram - pij_llrtopij_convert_histograms_buffed(hreal,vnull,hc,vb1,vb2); - //Convert likelihoods to probabilities - vva=MATRIXFF(row)(ans,j); - pij_llrtopij_histogram_interpolate_linear(hc,&vvd.vector,&vva.vector); - } - CLEANVECD(vwidth)CLEANVECD(vnull) - } - CLEANUP - return 0; -#undef CLEANUP -} - -int pij_llrtopij_a_convert_single_self(MATRIXF* d,size_t n1,size_t n2,char nodiag,long nodiagshift) -{ -#define CLEANUP CLEANAMHIST(hreal,nth)CLEANAMHIST(hc,nth)\ - CLEANHIST(h)CLEANMATD(mb1)CLEANMATD(mb2)CLEANMATD(mnull)CLEANMATF(mb3)CLEANVECD(vwidth) - - size_t ng=d->size1; - size_t i,nbin; - gsl_histogram *h; - MATRIXD *mb1,*mb2,*mnull; - MATRIXF *mb3; - VECTORD *vwidth; - VECTORDF(view) vv1; - size_t nth; - - h=0; - mb1=mb2=mnull=0; - mb3=0; - vwidth=0; - //Validity checks - { - int nth0=omp_get_max_threads(); - assert(nth0>0); - nth=(size_t)nth0; - } - - - AUTOCALLOC(gsl_histogram*,hreal,nth,64) - AUTOCALLOC(gsl_histogram*,hc,nth,64) - if(!(hreal&&hc)) - ERRRET("Not enough memory."); - - //Construct null density histograms - { - FTYPE dmin,dmax; - if(nodiag) - MATRIXFF(minmax_nodiag)(d,&dmin,&dmax,nodiagshift); - else - MATRIXFF(minmax)(d,&dmin,&dmax); - if((!(dmin>=0))||gsl_isnan(dmax)) - ERRRET("Negative or NAN found in input data. It may invalidate follow up analysis. This may be due to incorrect previous steps.") - if(gsl_isinf(dmax)) - { - LOG(5,"INF found in input data. It may invalidate follow up analysis. This may be due to incorrect previous steps or duplicate rows (by Spearman correlation).") - MATRIXFF(set_inf)(d,-1); - if(nodiag) - MATRIXFF(minmax_nodiag)(d,&dmin,&dmax,nodiagshift); - else - MATRIXFF(minmax)(d,&dmin,&dmax); - MATRIXFF(set_value)(d,-1,dmax); - } - h=pij_llrtopij_a_nullhist_single((double)dmax,d->size2,n1,n2); - if(!h) - ERRRET("pij_llrtopij_a_nullhist_single failed.") - nbin=h->n; - } - //Memory allocation - { - size_t n1,n2; - pij_llrtopij_convert_histograms_get_buff_sizes(nbin,&n1,&n2); - mb1=MATRIXDF(alloc)(nth,n1); - mb2=MATRIXDF(alloc)(nth,n2); - mnull=MATRIXDF(alloc)(nth,nbin); - mb3=MATRIXFF(alloc)(nth,d->size2); - vwidth=VECTORDF(alloc)(nbin); - if(!(mb1&&mb2&&mnull&&mb3&&vwidth)) - ERRRET("Not enough memory.") - } - - //Prepare for real histogram - { - int ret; - for(i=0,ret=1;in+2); - ret=ret&&hreal[i]&&hc[i]; - } - if(!ret) - ERRRET("Not enough memory."); - } - - //Conversion - vv1=VECTORDF(view_array)(h->range+1,nbin); - VECTORDF(memcpy)(vwidth,&vv1.vector); - vv1=VECTORDF(view_array)(h->range,nbin); - VECTORDF(sub)(vwidth,&vv1.vector); - vv1=VECTORDF(view_array)(h->bin,nbin); - #pragma omp parallel - { - size_t ng1,ng2,id; - size_t j; - long k; - VECTORDF(view) vvreal,vvnull,vvb1,vvb2; - VECTORFF(view) vvb3,vva; - - id=(size_t)omp_get_thread_num(); - vvreal=VECTORDF(view_array)(hreal[id]->bin,nbin); - vvnull=MATRIXDF(row)(mnull,id); - vvb1=MATRIXDF(row)(mb1,id); - vvb2=MATRIXDF(row)(mb2,id); - vvb3=MATRIXFF(row)(mb3,id); - - threading_get_startend(ng,&ng1,&ng2); - - for(j=ng1;jrange,h->range,(nbin+1)*sizeof(*hreal[id]->range)); - memset(hreal[id]->bin,0,nbin*sizeof(*hreal[id]->bin)); - //Construct real histogram - if(nodiag&&((long)j+nodiagshift>=0)&&((long)j+nodiagshift<(long)d->size2)) - { - for(k=(long)j+nodiagshift-1;k>=0;k--) - gsl_histogram_increment(hreal[id],MATRIXFF(get)(d,j,(size_t)k)); - for(k=(long)j+nodiagshift+1;k<(long)d->size2;k++) - gsl_histogram_increment(hreal[id],MATRIXFF(get)(d,j,(size_t)k)); - VECTORDF(scale)(&vvreal.vector,1./(double)(d->size2-1)); - } - else - { - for(k=0;k<(long)d->size2;k++) - gsl_histogram_increment(hreal[id],MATRIXFF(get)(d,j,(size_t)k)); - VECTORDF(scale)(&vvreal.vector,1./(double)(d->size2)); - } - - //Convert to density histogram - VECTORDF(div)(&vvreal.vector,vwidth); - //Convert to probability central histogram - pij_llrtopij_convert_histograms_buffed(hreal[id],&vvnull.vector,hc[id],&vvb1.vector,&vvb2.vector); - //Convert likelihoods to probabilities - vva=MATRIXFF(row)(d,j); - pij_llrtopij_histogram_interpolate_linear(hc[id],&vvb3.vector,&vva.vector); - } - } - - CLEANUP - return 0; -#undef CLEANUP -} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/pij/llrtopij_a.h b/pij/llrtopij_a.h deleted file mode 100644 index 9de373c..0000000 --- a/pij/llrtopij_a.h +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -/* This file contains the conversion from log likelihood ratio to probabilities, - * when gene pairs of the same A are converted together. - * - */ - -#ifndef _HEADER_LIB_PIJ_LLRTOPIJ_A_H_ -#define _HEADER_LIB_PIJ_LLRTOPIJ_A_H_ -#include "../base/config.h" -#include "../base/gsl/histogram.h" -#include "../base/types.h" -#ifdef __cplusplus -extern "C" -{ -#endif - -/* Construct multiple null histograms for different genotype value counts. - * The function calculates the null density histogram for random variable: - * x=-0.5*log(1-z1/(z1+z2)), where z1~chi2(i*n1c+n1d),z2~chi2(-i*n2c+n2d), - * i=0,...,nv-2. Histogram bin count and width are automatically determined - * from real data count (nd). - * For bin range settings, see histogram_unequalbins_fromnullcdf. - * For null density histogram from pdf, see pij_nulldist_hist_pdf. - * dmax: Specifies the histogram bound as [0,dmax). - * nv: Maximum number of values each genotype can type. Must be nv>=2. - * This limits the possible values of kv in distribution, and - * also output histogram count. - * nd: Count of real data to form real histograms. This is used to - * automatically decide number of bins and widths. - * n1c, - * n1d, - * n2c - * n2d: Parameters of null distribution. - * Return: [nv-1]. Constructed null distribution histograms with preset - * bin ranges and values as density. Genotypes with i values have - * histogram stored in Return[i-2]. - */ -gsl_histogram** pij_llrtopij_a_nullhist(double dmax,size_t nv,size_t nd,long n1c,size_t n1d,long n2c,size_t n2d); - -/* Convert LLR of real data to probabilities, when the distribution - * of LLR of null distribution can be calculated analytically to follow - * x=-0.5*log(1-z1/(z1+z2)), where z1~chi2(n1),z2~chi2(n2). - * The conversion is performed for each gene A, i.e. per row of d and dconv. - * This function is older than pij_llrtopij_a_convert_single so it is not parallel. - * Make it parallel before using. - * d: [nrow,nx] The data to use for calculation of conversion rule from LLR to pij. - * dconv: [nrow,nd] The data of LLR to actually convert to pij. Can be same with d. - * ans: [nrow,nd] The output location of converted pij from dconv. - * n1, - * n2: Parameters of null distribution. - * nodiag: Whether diagonal elements of d should be ignored when converting - * to probabilities. - * nodiagshift: Diangonal column shift for nodiag==1. - * For nodiagshift>0/<0, use upper/lower diagonal. - * Return: 0 on success. - */ -int pij_llrtopij_a_convert_single(const MATRIXF* d,const MATRIXF* dconv,MATRIXF* ans,size_t n1,size_t n2,char nodiag,long nodiagshift); - -// Same with pij_llrtopij_a_convert_single, for d=dconv=ans. Saves memory. -int pij_llrtopij_a_convert_single_self(MATRIXF* d,size_t n1,size_t n2,char nodiag,long nodiagshift); - - -#ifdef __cplusplus -} -#endif -#endif diff --git a/pij/llrtopij_tot.c b/pij/llrtopij_tot.c deleted file mode 100644 index 1364ca9..0000000 --- a/pij/llrtopij_tot.c +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -#include "../base/config.h" -#include -#include -#include -#include -#include -#include "../base/gsl/math.h" -#include "../base/gsl/histogram.h" -#include "../base/gsl/blas.h" -#include "../base/logger.h" -#include "../base/macros.h" -#include "../base/data_process.h" -#include "../base/threading.h" -#include "../base/histogram.h" -#include "nullhist.h" -#include "llrtopij_tot.h" -#include "llrtopij.h" -#pragma GCC diagnostic ignored "-Wunused-parameter" - -int pij_llrtopij_tot_convert(const VECTORF* d,const VECTORF* dconv,VECTORF* ans,gsl_histogram* (*fh)(const VECTORF*,const void*),int (*fnh)(const void*,gsl_histogram*),const void* ph,const void* pnh,double* nullratio) -{ -#define CLEANUP CLEANHIST(hist)CLEANHIST(histc)AUTOFREE(histn)AUTOFREEVEC(vbuff) - - gsl_histogram *hist,*histc; -// double* histn; Normal histogram of null density - double* tp; - VECTORDF(view) vv; - size_t i; - int ret; - - hist=fh(d,ph); - if(!hist) - { - LOG(1,"Histogram bin range construction failed.") - return 1; - } - AUTOALLOC(double,histn,hist->n,1000) - histc=gsl_histogram_alloc(hist->n+2); - AUTOALLOCVECD(vbuff,hist->n,1000) - if(!(histn&&histc&&vbuff)) - ERRRET("Not enough memory.") - //Real histogram - for(i=0;isize;i++) - gsl_histogram_increment(hist,(double)VECTORFF(get)(d,i)); - - //Null histogram - tp=histn; - histn=hist->bin; - hist->bin=tp; - ret=fnh(pnh,hist); - tp=histn; - histn=hist->bin; - hist->bin=tp; - if(ret) - ERRRET("Custom null histogram function failed.") - - //Calculate ratio of true distribution - { - VECTORDF(view) vvnull=VECTORDF(view_array)(histn,hist->n); - //convert to density histogram - vv=VECTORDF(view_array)(hist->range+1,hist->n); - VECTORDF(memcpy)(vbuff,&vv.vector); - vv=VECTORDF(view_array)(hist->range,hist->n); - VECTORDF(sub)(vbuff,&vv.vector); - VECTORDF(div)(&vvnull.vector,vbuff); - vv=VECTORDF(view_array)(hist->bin,hist->n); - VECTORDF(scale)(&vv.vector,1./(double)d->size); - VECTORDF(div)(&vv.vector,vbuff); - if(pij_llrtopij_convert_histograms(hist,&vvnull.vector,histc)) - ERRRET("Failed in pij_llrtopij_convert_histograms.") - } - //Convert likelihoods to probabilities - pij_llrtopij_histogram_interpolate_linear(histc,dconv,ans); - - if(nullratio) - { - *nullratio=1-BLASF(asum)(ans)/(FTYPE)ans->size; - LOG(9,"Null distribution ratio: %G",*nullratio) - } - CLEANUP - return 0; -#undef CLEANUP -} - - - - - - - - - - - - - - - - - - - - - diff --git a/pij/llrtopij_tot.h b/pij/llrtopij_tot.h deleted file mode 100644 index 5c32cf0..0000000 --- a/pij/llrtopij_tot.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -/* This file contains the conversion from log likelihood ratio to probabilities, - * when all gene pairs are considered together. - * - */ - -#ifndef _HEADER_LIB_PIJ_LLRTOPIJ_TOT_H_ -#define _HEADER_LIB_PIJ_LLRTOPIJ_TOT_H_ -#include "../base/config.h" -#include "../base/gsl/histogram.h" -#include "../base/types.h" -#ifdef __cplusplus -extern "C" -{ -#endif - -/* Converts log likelihood ratio data into probabilities based on the reference null log likelihood ratios. - * The approach assumes the real data is the outcome of a mixture of alternative and null hypotheses. - * The approach first simulates null log likelihood ratios from permuted the real data, and then - * draw histograms of log likelihood ratios for both real and only null situations. - * Assuming on one tail of the histogram, the real data should contain mostly (if not only) null hypothesis. - * The null fraction can be known at the tail, and therefore at every bin based on the simulated - * null histogram shape. The final probability is defined to be the ratio of alternative hypothesis versus - * total, only on that bin. - * NOTE: Real and simulated data should both be non-negative. Real data should reach true probability->0 - * at the limit of log likelihood ratio ->0+. - * d: Real log likelihood ratios, to be compared against permuted (null) data. Real data is assumed to contain - * both null and alternative hypotheses. - * dconv: Real log likelihood ratios to be converted to probabilities, based on the comparison between d and null distribution. - * Same with d in most cases. - * ans: Output vector for the probabilities. Have same size with dconv, and each provides the estimated probability - * of the corresponding entry of dconv. - * fh: Function to obtain real histogram from data d. It should automatically determine bin widths. Takes parameters: - * Param 1: Data points. (i.e. d). - * Param 2: Parameter of the function to be specified in a later parameter, and passed on by pij_llrtopij_convert. - * Return: Constructed gsl_histogram if success, or 0 if fail. - * fnh: Function to obtain null histogram. Can be simulation based or analytical. They are defined in pij_gassist/nullhist.h. - * Param 1: Parameter to pass on to the function and defined later in the parameters. - * Param 2: Existing empty histogram to fill for the function. - * Param 3: Number of parallel threads. - * Return: 0 if success. - * ph: Parameter (2) to be passed onto fh. - * pnh: Parameter (1) to be passed onto fnh. - * nullratio: if not NULL, return the ratio of null data in real data into nullratio. - * Return: 0 on success. - */ -int pij_llrtopij_tot_convert(const VECTORF* d,const VECTORF* dconv,VECTORF* ans,gsl_histogram* (*fh)(const VECTORF*,const void*),int (*fnh)(const void*,gsl_histogram*),const void* ph,const void* pnh,double* nullratio); - - - - - - - - - - - - - - - - - - - - - - - - - - -#ifdef __cplusplus -} -#endif -#endif diff --git a/pij/llrtopv.c b/pij/llrtopv.c index 4c1ce77..af8fa09 100644 --- a/pij/llrtopv.c +++ b/pij/llrtopv.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/pij/llrtopv.h b/pij/llrtopv.h index 931c183..b91df3d 100644 --- a/pij/llrtopv.h +++ b/pij/llrtopv.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/pij/nulldist.c b/pij/nulldist.c index 3209f99..92dc3e4 100644 --- a/pij/nulldist.c +++ b/pij/nulldist.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/pij/nulldist.h b/pij/nulldist.h index 5147bca..6a57a2b 100644 --- a/pij/nulldist.h +++ b/pij/nulldist.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * diff --git a/pij/nullhist.c b/pij/nullhist.c index a73fc86..796dc8e 100644 --- a/pij/nullhist.c +++ b/pij/nullhist.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -18,177 +18,98 @@ #include "../base/config.h" #include #include -#include -#include -#include "../base/gsl/blas.h" +#include +#include "../base/gsl/histogram.h" #include "../base/logger.h" #include "../base/macros.h" #include "../base/histogram.h" -#include "../base/threading.h" -#include "nullhist.h" #include "nulldist.h" +#include "nullhist.h" -/* Null histogram generation of sample-model method on single thread. - * g,t,t2,sampler,modeler,ps: See pij_nullhist_sample_model_param. - * pmc: Model container object. See definition in nullmodel.h. - * Return: 0 if success. - */ -static int pij_nullhist_sample_model_single(const void* data,const struct pij_nullsampler* sampler,const struct pij_nullmodeler* modeler,const void* ps,void* pmc) + + +gsl_histogram* pij_nullhist_single(double dmax,size_t nd,size_t n1,size_t n2) { -#define CLEANUP if(ptrs){sampler->close(ptrs);ptrs=0;}\ - if(pm){modeler->close(pm);pm=0;} - - struct timeval tv; - void *ptrs,*pm; - size_t n1,n2; - size_t i; +#define CLEANUP CLEANHIST(h) + struct pij_nulldist_pdfs_param param={n1,n2}; + size_t nbin; + gsl_histogram *h=0; - assert(sampler&&modeler); - ptrs=pm=0; - //Initialize sampler with timely random seed. - gettimeofday(&tv,NULL); - ptrs=sampler->init(data,ps,(long unsigned int)tv.tv_usec); - if(!ptrs) - ERRRET("Sampler construction failed.") - //Obtain dimensions of random samples. - sampler->dimensions(ptrs,&n1,&n2); assert(n1&&n2); - - AUTOALLOCVECF(vd,n1,30000) - if(!vd) - ERRRET("Not enough memory.") - //Initialize modeler object given model container object. - pm=modeler->init(pmc,data,n1,n2); - if(!pm) - { - AUTOFREEVEC(vd) - ERRRET("Modeler construction failed.") - } - //Sample - for(i=0;isample(ptrs,vd); - modeler->input(pm,vd); - } - //Synced updated of model container. - #pragma omp critical - modeler->merge(pm,pmc); - CLEANUP - AUTOFREEVEC(vd) - return 0; + dmax*=(1+1E-6); + nbin=histogram_unequalbins_param_count(nd); + if(nbin<5) + ERRRETV(0,"Determined "PRINTFSIZET" bins constructed. Bin count too small.",nbin) + else if(nbin<10) + LOG(5,"Determined "PRINTFSIZET" bins, smaller than recommended minimum bin count (10).",nbin) + else + LOG(10,"Determined "PRINTFSIZET" bins.",nbin) + h=gsl_histogram_alloc(nbin); + if(!h) + ERRRETV(0,"Not enough memory.") + //Null density histogram + gsl_histogram_set_ranges_uniform(h,0,dmax); + //Set null histogram ranges + if(histogram_unequalbins_fromnullpdfs(nbin,h->range,pij_nulldist_pdfs,¶m)) + ERRRETV(0,"histogram_unequalbins_fromnullpdfs failed.") + //Calculate null density histogram + if(pij_nulldist_hist_pdf(h->range,nbin,h->bin,param.n1,param.n2,5)) + ERRRETV(0,"pij_nulldist_hist_pdf failed.") + return h; #undef CLEANUP } -int pij_nullhist_sample_model(const void* param,gsl_histogram* h) +gsl_histogram** pij_nullhist(double dmax,size_t nv,size_t nd,long n1c,size_t n1d,long n2c,size_t n2d) { -#define CLEANUP if(container){\ - p->modeler->close_container(container);\ - container=0;} - const struct pij_nullhist_sample_model_param *p=param; +#define CLEANUP if(h){for(i=0;imodeler->init_container(p->d,p->pm,h); - if(!container) - ERRRET("Modeler container construction failed.") - - #pragma omp parallel shared(ret) + assert(nv>=2); + dmax*=(1+1E-6); + CALLOCSIZE(h,nv-1); + if(!h) + ERRRETV(0,"Not enough memory.") + nbin=histogram_unequalbins_param_count(nd); + if(nbin<5) + ERRRETV(0,"Determined "PRINTFSIZET" bins constructed. Bin count too small.",nbin) + else if(nbin<10) + LOG(5,"Determined "PRINTFSIZET" bins, smaller than recommended minimum bin count (10).",nbin) + else + LOG(10,"Determined "PRINTFSIZET" bins.",nbin) + ret=1; + for(i=0;idsize]; - //Initializing block selection - if(p->partition(p->d,d,(size_t)omp_get_thread_num(),(size_t)omp_get_num_threads())) - { - int retth; - //Calculate null histograms - retth=pij_nullhist_sample_model_single(d,p->sampler,p->modeler,p->ps,container); - #pragma omp atomic - ret+=retth; - } + gsl_histogram_set_ranges_uniform(h[i],0,dmax); + //Set null histogram ranges + param.n1=(size_t)((long)i*n1c+(long)n1d); + param.n2=(size_t)(-(long)i*n2c+(long)n2d); + if(histogram_unequalbins_fromnullpdfs(nbin,h[i]->range,pij_nulldist_pdfs,¶m)) + ERRRETV(0,"histogram_unequalbins_fromnullpdfs failed.") + //Calculate null density histogram + if(pij_nulldist_hist_pdf(h[i]->range,nbin,h[i]->bin,param.n1,param.n2,5)) + ERRRETV(0,"pij_nulldist_hist_pdf failed.") } - if(ret) - ERRRET("Failed to construct histogram.") - - //Output from model container - ret=p->modeler->output(container,h); - if(ret) - ERRRET("Failed to output histogram.") - CLEANUP - return 0; + return h; #undef CLEANUP } -int pij_nullhist_analytical_pdf(const void* param,gsl_histogram* h) -{ -#define CLEANUP CLEANVECD(loc)CLEANVECD(val) - const struct pij_nullhist_analytical_pdf_param *p=param; - - VECTORD *loc,*val; - VECTORDF(view) vvh=VECTORDF(view_array)(h->bin,h->n); - size_t i; - size_t nsp; - double v; - - assert((p->nsplit<10)); - nsp=(size_t)1<<(p->nsplit); - loc=VECTORDF(alloc)(h->n*nsp); - val=VECTORDF(alloc)(h->n*nsp); - if(!(loc&&val)) - ERRRET("Not enough memory.") - - //Construct bin ranges - if(nsp==1) - { - VECTORDF(const_view) vv1=VECTORDF(const_view_array)(h->range,h->n); - VECTORDF(const_view) vv2=VECTORDF(const_view_array)(h->range+1,h->n); - VECTORDF(memcpy)(loc,&vv1.vector); - VECTORDF(add)(loc,&vv2.vector); - VECTORDF(scale)(loc,0.5); - } - else - { - VECTORDF(const_view) vvc=VECTORDF(const_view_array)(h->range,h->n+1); - histogram_finer_central(&vvc.vector,loc,nsp); - } - //Calculate bin densities - if(p->func(loc,val,p->param)) - { - CLEANUP - return 1; - } - - //Shrink to output - if(nsp==1) - { - VECTORDF(memcpy)(&vvh.vector,val); - } - else - { - VECTORDF(set_zero)(&vvh.vector); - for(i=0;in); - VECTORDF(add)(&vvh.vector,&vvc.vector); - } - } - //Convert to bin values from densities - { - VECTORDF(const_view) vv1=VECTORDF(const_view_array)(h->range,h->n); - VECTORDF(const_view) vv2=VECTORDF(const_view_array)(h->range+1,h->n); - VECTORDF(view) vv3=VECTORDF(subvector)(loc,0,h->n); - VECTORDF(memcpy)(&vv3.vector,&vv2.vector); - VECTORDF(sub)(&vv3.vector,&vv1.vector); - VECTORDF(mul)(&vvh.vector,&vv3.vector); - } - //Scale to unity. - v=gsl_blas_dasum(&vvh.vector); - VECTORDF(scale)(&vvh.vector,1/v); - CLEANUP - return 0; -#undef CLEANUP -} + + + + + + + + diff --git a/pij/nullhist.h b/pij/nullhist.h index 509aab4..ba18906 100644 --- a/pij/nullhist.h +++ b/pij/nullhist.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -15,122 +15,59 @@ * You should have received a copy of the GNU Affero General Public License * along with Findr. If not, see . */ -/* This file contains the methods to construct null histogram. - * Each method contains two parts: an interface function to produce - * the null histogram and a struct for the parameters needed by the - * function. - * Currently two methods are implemented. - * 1: Sample-model method utilizes sampler functions - * to randomly sample null data and modeler functions to model the - * null histogram based on sampled null data. This is a generic - * method and can be tailored for specific needs. - * 2: Analytical method specifically for pij model, where the distribution - * of LLR of null hypothesis is exactly calculable. This is - * the actually used method for faster and more precise performance. - * - * Regardless of method, the interface function should conform with - * the following definition: - * (*int)(const void* p,gsl_histogram* h); - * p: Parameter of the method. - * h: Existing histogram with range specified. The function should - * fill this histogram in the bins. - * Return: 0 on success. +/* This part produces the histogram of parametric null distributions. */ #ifndef _HEADER_LIB_PIJ_NULLHIST_H_ #define _HEADER_LIB_PIJ_NULLHIST_H_ #include "../base/config.h" -#include "../base/gsl/histogram.h" #include "../base/types.h" -#include "nullsampler.h" -#include "nullmodeler.h" +#include "../base/gsl/histogram.h" #ifdef __cplusplus extern "C" { #endif -/**************************************************************** - * The sample-model method for generation of null histogram - * Uses a sampler function to randomly sample null data and a modeler - * to model the histogram of null data. The samplers are defined - * in pij/nullsampler.h. The modelers are defined in - * pij/nullmodeler.h. The sample-model method is generic - * but for the specific pij model, analytical method (see below) - * perform more precise and faster. - ***************************************************************/ - -struct pij_nullhist_sample_model_param -{ - //Input data - const void *d; - /* Data struct size (for d). - * Must be small and use pointers, for memory saving and easier partitioning. - */ - size_t dsize; - /* Partition function to partition mission/data into smaller chunks. - * Look threading_threading_get_startend for help - * src: Larger data to be partitioned, having same type as d - * dest: Smaller data after partitioning, same type as d - * id: Id of the data chuck (out of total) to obtain from partition - * total: Total number of chucks for partitioning. - * Return: 'Size' of dest after partition. Must be !=0 for nonzero data size - * which needs further process, or 0 if no calculation is required - * due to zero size in the partition. - */ - size_t (*partition)(const void* src,void* dest,size_t id,size_t total); - //Sampler function struct - const struct pij_nullsampler* sampler; - //Modeler function struct - const struct pij_nullmodeler* modeler; - //Parameter of sampler function - const void* ps; - //Parameter of modeler function - const void* pm; -}; - -// Interface function of sample-model method, multithreaded. -int pij_nullhist_sample_model(const void* p,gsl_histogram* h); - - -/**************************************************************** - * The analytical methods to calculate null histograms for pij - * is faster and more precise. This method calculates pdf values at - * evenly spreaded points within each bin for better accuracy. - * The three steps of pij share the same analytical formula of - * null histograms. - ***************************************************************/ - - -struct pij_nullhist_analytical_pdf_param -{ - /* Number of split within each bin. 2^nsplit central points are - * taken and the mean is calculated as the average pdf within the bin. - */ - size_t nsplit; - /* Null pdf function to draw null histogram from - * Param 1: Coordinates to calculate pdfs - * Param 2: Output buffer for pdfs as return - * Param 3: Parameter accepted by function - * Return: 0 on success. - */ - int (*func)(const VECTORD*,VECTORD*,const void*); - //Parameter accepted by pdf function (Param 3) - const void* param; -}; - -/* Generic interface function of analytical method. - * Same as interface function except with one extra parameter in the end. - * func: to specifiy which function to use to calculate null pdf. - */ -int pij_nullhist_analytical_pdf(const void* param,gsl_histogram* h); - - - - - - - +/* Construct one null histogram for a specific genotype value count. + * The function calculates the null density histogram for random variable: + * x=-0.5*log(1-z1/(z1+z2)), where z1~chi2(n1),z2~chi2(n2), + * Histogram bin count and width are automatically determined + * from real data count (nd). + * For bin range settings, see histogram_unequalbins_fromnullcdf. + * For null density histogram from pdf, see pij_nulldist_hist_pdf. + * dmax: Specifies the histogram bound as [0,dmax). + * nd: Count of real data to form real histograms. This is used to + * automatically decide number of bins and widths. + * n1, + * n2: Parameters of null distribution. + * Return: Constructed null distribution histograms with preset + * bin ranges and values as density. + */ +gsl_histogram* pij_nullhist_single(double dmax,size_t nd,size_t n1,size_t n2); + +/* Construct multiple null histograms for different genotype value counts. + * The function calculates the null density histogram for random variable: + * x=-0.5*log(1-z1/(z1+z2)), where z1~chi2(i*n1c+n1d),z2~chi2(-i*n2c+n2d), + * i=0,...,nv-2. Histogram bin count and width are automatically determined + * from real data count (nd). + * For bin range settings, see histogram_unequalbins_fromnullcdf. + * For null density histogram from pdf, see pij_nulldist_hist_pdf. + * dmax: Specifies the histogram bound as [0,dmax). + * nv: Maximum number of values each genotype can type. Must be nv>=2. + * This limits the possible values of kv in distribution, and + * also output histogram count. + * nd: Count of real data to form real histograms. This is used to + * automatically decide number of bins and widths. + * n1c, + * n1d, + * n2c + * n2d: Parameters of null distribution. + * Return: [nv-1]. Constructed null distribution histograms with preset + * bin ranges and values as density. Genotypes with i values have + * histogram stored in Return[i-2]. + */ +gsl_histogram** pij_nullhist(double dmax,size_t nv,size_t nd,long n1c,size_t n1d,long n2c,size_t n2d); diff --git a/pij/nullmodeler.c b/pij/nullmodeler.c deleted file mode 100644 index 987457f..0000000 --- a/pij/nullmodeler.c +++ /dev/null @@ -1,220 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -#include "../base/config.h" -#include -#include -#include -#include -#include -#include "../base/gsl/blas.h" -#include "../base/logger.h" -#include "../base/macros.h" -#include "nullmodeler.h" -#pragma GCC diagnostic ignored "-Wunused-parameter" - -static void* pij_nullmodeler_exp_init(const void* c,const void* d,size_t n1,size_t n2); - -static void pij_nullmodeler_exp_close(void* m); - -static void* pij_nullmodeler_exp_init_container(const void* d,const void* pm,const gsl_histogram* h) -{ - //Model container same with modeler - return pij_nullmodeler_exp_init(0,d,0,0); -} - -static int pij_nullmodeler_exp_output(const void* c,gsl_histogram* h) -{ -#define CLEANUP AUTOFREEVEC(vwidth) - const struct pij_nullmodeler_exp_state* p=c; - double v; - size_t i; - AUTOALLOCVECD(vwidth,h->n,1000) - VECTORDF(view) vv1,vv2; - - if(!vwidth) - ERRRET("Not enough memory.") - - //Calculate central exponential values as density. - vv1=VECTORDF(view_array)(h->range,h->n); - vv2=VECTORDF(view_array)(h->bin,h->n); - VECTORDF(memcpy)(&vv2.vector,&vv1.vector); - vv1=VECTORDF(view_array)(h->range+1,h->n); - VECTORDF(add)(&vv2.vector,&vv1.vector); - VECTORDF(scale)(&vv2.vector,-(double)(p->n)/(2*p->v)); - for(i=0;in;i++) - h->bin[i]=exp(h->bin[i]); - //Bin=density*width - VECTORDF(memcpy)(vwidth,&vv1.vector); - vv1=VECTORDF(view_array)(h->range,h->n); - VECTORDF(sub)(vwidth,&vv1.vector); - VECTORDF(mul)(&vv2.vector,vwidth); - //Normalize to total 1. - v=gsl_blas_dasum(&vv2.vector); - VECTORDF(scale)(&vv2.vector,1/v); - CLEANUP - return 0; -#undef CLEANUP -} - -static void pij_nullmodeler_exp_close_container(void* c) -{ - pij_nullmodeler_exp_close(c); -} - -static void* pij_nullmodeler_exp_init(const void* c,const void* d,size_t n1,size_t n2) -{ - struct pij_nullmodeler_exp_state* p; - MALLOCSIZE(p,1); - if(!p) - return 0; - p->n=0; - p->v=0; - return p; -} -static void pij_nullmodeler_exp_input(void* m,const VECTORF* data) -{ - struct pij_nullmodeler_exp_state* p=m; - p->v+=BLASF(asum)(data); - p->n+=data->size; -} - -static void pij_nullmodeler_exp_merge(const void* m,void* c) -{ - const struct pij_nullmodeler_exp_state* pm=m; - struct pij_nullmodeler_exp_state* pc=c; - pc->n+=pm->n; - pc->v+=pm->v; -} - -static void pij_nullmodeler_exp_close(void* m) -{ - if(m) - free(m); -} - -const struct pij_nullmodeler pij_nullmodeler_exp={ - pij_nullmodeler_exp_init_container, - pij_nullmodeler_exp_output, - pij_nullmodeler_exp_close_container, - pij_nullmodeler_exp_init, - pij_nullmodeler_exp_input, - pij_nullmodeler_exp_merge, - pij_nullmodeler_exp_close -}; - - - - - - - -static void* pij_nullmodeler_naive_init_container(const void* d,const void* pm,const gsl_histogram* h) -{ -#define CLEANUP if(p){CLEANHIST(p->h)free(p);p=0;} - struct pij_nullmodeler_naive_state *p; - CALLOCSIZE(p,1); - if(!p) - ERRRETV(0,"Not enough memory.") - p->h=gsl_histogram_alloc(h->n); - if(!p->h) - ERRRETV(0,"Not enough memory.") - //Initialize histogram with same ranges. - gsl_histogram_set_ranges(p->h,h->range,h->n+1); - return p; -#undef CLEANUP -} - -static int pij_nullmodeler_naive_output(const void* c,gsl_histogram* h) -{ - const struct pij_nullmodeler_naive_state* p=c; - VECTORDF(view) vv=VECTORDF(view_array)(h->bin,h->n); - - memcpy(h->bin,p->h->bin,h->n*sizeof(*h->bin)); - //Scale to unit total probability. - VECTORDF(scale)(&vv.vector,1/gsl_blas_dasum(&vv.vector)); - return 0; -} - -static void pij_nullmodeler_naive_close_container(void* c) -{ - struct pij_nullmodeler_naive_state* p=c; - CLEANHIST(p->h) - free(p); -} - -static void* pij_nullmodeler_naive_init(const void* c,const void* d,size_t n1,size_t n2) -{ - const struct pij_nullmodeler_naive_state* p=c; - return pij_nullmodeler_naive_init_container(d,0,p->h); -} - -static void pij_nullmodeler_naive_input(void* m,const VECTORF* data) -{ - struct pij_nullmodeler_naive_state* p=m; - size_t i; - for(i=0;isize;i++) - gsl_histogram_increment(p->h,VECTORFF(get)(data,i)); -} - -static void pij_nullmodeler_naive_merge(const void* m,void* c) -{ - const struct pij_nullmodeler_naive_state* pm=m; - struct pij_nullmodeler_naive_state* pc=c; - gsl_histogram_add(pc->h,pm->h); -} - -static void pij_nullmodeler_naive_close(void* m) -{ - pij_nullmodeler_naive_close_container(m); -} - -const struct pij_nullmodeler pij_nullmodeler_naive={ - pij_nullmodeler_naive_init_container, - pij_nullmodeler_naive_output, - pij_nullmodeler_naive_close_container, - pij_nullmodeler_naive_init, - pij_nullmodeler_naive_input, - pij_nullmodeler_naive_merge, - pij_nullmodeler_naive_close -}; - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/pij/nullmodeler.h b/pij/nullmodeler.h deleted file mode 100644 index 0fd04e2..0000000 --- a/pij/nullmodeler.h +++ /dev/null @@ -1,143 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -/* This file contains the modelers of LLR histograms of null hypothesis. - * Modelers are used by sample-model method to produce null histogram. - * (See pij_nullhist_sample_model.) - * - * Each null modeler is consisted of seven functions and a parameter struct. - * For function formats, see struct pij_nullmodeler. - * The nullhist function will first invoke 'init_container' method to obtain - * the unique model container object. Then for each thread, one modeler object - * is constructed as a child of the container, via 'init' method. Each thread - * holds a separate modeler object and use 'input' method to feed in data. - * Upon finishing, every thread merges modeler into the model container, with - * 'merge'. Merge takes place for one thread at a time, guaranteed by nullhist. - * The thread will 'close' modeler before exit. When all child threads completes, - * the master thread will apply 'output' method on the model container to obtain - * null histogram, and then use 'close_container' method to free up container. - * - * Two modelers are implemented: - * 1: Exponential modeler assumes LLR satisfies an exponential distribution. - * 2: Naive modeler assumes the true LLR histogram is in exact agreement with - * the sampled histogram. - */ - -#ifndef _HEADER_LIB_PIJ_NULLMODELER_H_ -#define _HEADER_LIB_PIJ_NULLMODELER_H_ -#include "../base/config.h" -#include "../base/gsl/histogram.h" -#include "../base/types.h" -#ifdef __cplusplus -extern "C" -{ -#endif - - -struct pij_nullmodeler -{ - /* Container initialization function. Should be invoked before - * any other operation. - * Function should perform parameter storage and memory allocations here. - * Somer modelers may use only part of the parameters. - * Param 1: Input data - * Param 2: Modeler parameter - * Param 3: Histogram as output. - * Return: Modeler container parameter struct pointer, or 0 if failed. - */ - void* (*init_container)(const void*,const void*,const gsl_histogram*); - /* Output modeler container to histogram. - * Histogram should sum to 1. - * Param 1: Modeler container parameter struct. - * Param 2: Output histogram. Bin ranges must be identical with the one - * referenced in init. - */ - int (*output)(const void*,gsl_histogram*); - /* Closes modeler container and frees memory. - * Param 1: Modeler container parameter struct. - */ - void (*close_container)(void*); - /* Initialization function. Should be invoked after container but - * before any other operation. - * Function should perform parameter storage and memory allocations here. - * Somer modelers may use only part of the parameters. - * Param 1: Output model container for reference. - * Param 2: Input data - * Param 3: n1. Sample size to be provided each time. - * Param 4: n2. Number of times samples will be provided. - * So n1*n2 is the total sample count. - * Return: Modeler parameter struct pointer, or 0 if failed. - */ - void* (*init)(const void*,const void*,size_t,size_t); - /* Input samples into modeler. Each sample should count 1. - * Param 1: Modeler parameter struct. - * Param 2: Existing samples. - */ - void (*input)(void*,const VECTORF*); - /* Perform final calculations and merge modeler into modeler container. - * Function does not need to guarantee threadsafe. - * Param 1: Modeler struct. - * Param 2: Modeler container. - */ - void (*merge)(const void*,void*); - /* Close modeler and frees memory. - * Param 1: Modeler struct. - */ - void (*close)(void*); - -}; - -/************************************************************************* - * Exponential modeler believes null data yields exponential distribution. - * This requries all data to be non-negative. - * The modeler seeks the best exponential coefficient via maximization of - * likelihood. - ************************************************************************/ - -//Modeler state and also model container state -struct pij_nullmodeler_exp_state -{ - //Number of samples - size_t n; - //Sum of samples - FTYPE v; -}; - -extern const struct pij_nullmodeler pij_nullmodeler_exp; - - -/************************************************************************* - * Naive modeler assumes the pdf of falling into each bin is exactly proportional - * to the number of instances sampled to be in that bin. - * Therefore a simple accumulation of samples in existing bins would suffice. - ************************************************************************/ -struct pij_nullmodeler_naive_state -{ -//Histogram of sampled null data. - gsl_histogram* h; -}; - -extern const struct pij_nullmodeler pij_nullmodeler_naive; - - - - - -#ifdef __cplusplus -} -#endif -#endif diff --git a/pij/nullsampler.h b/pij/nullsampler.h deleted file mode 100644 index c1ad98e..0000000 --- a/pij/nullsampler.h +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright 2016, 2017 Lingfei Wang - * - * This file is part of Findr. - * - * Findr is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Findr is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with Findr. If not, see . - */ -/* This file contains the LLR sampler definition of null hypotheses. - * Samplers are used in combination with modelers for the estimation - * of histograms for the distribution of LLR of null hypothesis, as - * applied in sample-model method. (See pij_nullhist_sample_model.) - * - * Each null sampler is consisted of four functions and a parameter struct. - * For function formats, see struct pij_nullsampler. - * - * For multithreading, each thread holds a separate and independent sampler. - */ - -#ifndef _HEADER_LIB_PIJ_NULLSAMPLER_H_ -#define _HEADER_LIB_PIJ_NULLSAMPLER_H_ -#include "../base/config.h" -#include "../base/gsl/histogram.h" -#include "../base/types.h" -#include "../base/random.h" -#ifdef __cplusplus -extern "C" -{ -#endif - - -struct pij_nullsampler -{ - /* Initialization function. Should be invoked before any other operation. - * Function should perform parameter storage and memory allocations here. - * Somer samplers may use only part of the parameters. - * Param 1: Input data - * Param 2: Sampler parameters - * Param 3: Initial random seed. - * Return: Sampler parameter struct pointer, or 0 if failed. - */ - void* (*init)(const void*,const void*,unsigned long); - /* Obtain dimensionality of samples. - * Param 1: Sampler parameter struct. - * Param 2: Return the return size of vector each time sampling function - * is invoked. - * Param 3: Return the number of cycles sampling function should be invoked. - */ - void (*dimensions)(const void*,size_t*,size_t*); - /* Produce samples by random with the stated amount by function dimensions. - * Param 1: Sampler parameter struct. - * Param 2: Existing buffer to write samples into. - */ - void (*sample)(void*,VECTORF*); - /* Close sampler and frees memory. - * Param 1: Sampler parameter struct. - */ - void (*close)(void*); -}; - -#ifdef __cplusplus -} -#endif -#endif diff --git a/pij/rank.c b/pij/rank.c index bf3865b..3892272 100644 --- a/pij/rank.c +++ b/pij/rank.c @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -28,7 +28,7 @@ #include "../base/data_process.h" #include "../base/supernormalize.h" #include "../base/threading.h" -#include "llrtopij_a.h" +#include "llrtopij.h" #include "llrtopv.h" #include "rank.h" @@ -145,7 +145,7 @@ int pij_rank_pv(const MATRIXF* t,const MATRIXF* t2,MATRIXF* p,size_t memlimit) #undef CLEANUP } -/* Convert LLR into probabilities per A. Uses pij_llrtopij_a_convert. +/* Convert LLR into probabilities per A. Uses pij_llrtopij_convert. * ans: (ng,nt) Source real LLRs to compare with null LLRs, * also output location of converted probabilities. * ns: Number of samples, used for calculation of null distribution @@ -155,7 +155,7 @@ int pij_rank_pv(const MATRIXF* t,const MATRIXF* t2,MATRIXF* p,size_t memlimit) * For nodiagshift>0/<0, use upper/lower diagonals of corresponding id. * Return: 0 if succeed. */ -static int pij_rank_llrtopij_a(MATRIXF* ans,size_t ns,char nodiag,long nodiagshift) +static int pij_rank_llrtopij(MATRIXF* ans,size_t ns,char nodiag,long nodiagshift) { LOG(9,"Converting LLR to probabilities on per A basis.") if(ns<=2) @@ -163,22 +163,10 @@ static int pij_rank_llrtopij_a(MATRIXF* ans,size_t ns,char nodiag,long nodiagshi LOG(0,"Needs at least 3 samples to compute probabilities.") return 1; } - return pij_llrtopij_a_convert_single_self(ans,1,ns-2,nodiag,nodiagshift); + return pij_llrtopij_convert_single_self(ans,1,ns-2,nodiag,nodiagshift); } -/* Calculate probabilities of A--B based on LLR distributions of real data - * and null hypothesis. - * t: (ng,ns) Expression data for A - * t2: (nt,ns) Expression data for B - * p: (ng,nt) Output for probabilities A--B is true - * nodiag: When the top ng rows of t2 is exactly t, diagonals of pij are meaningless. - * In this case, set nodiag to 1 to avoid inclusion of NANs. For nodiag=0, t and t2 - * should not have any identical genes. - * pij: Function to convert LLR to probabilities, such as pij_rank_llrtopij_a - * memlimit:Specifies approximate memory usage. Function can fail if memlimit is too small. For large dataset, memory usage will be reduced by spliting t into smaller chunks and infer separately. For unlimited memory, set memlimit=-1. - * Return: 0 if succeed. - */ -static int pij_rank_any(const MATRIXF* t,const MATRIXF* t2,MATRIXF* p,char nodiag,int (*pij)(MATRIXF*,size_t,char,long),size_t memlimit) +int pij_rank(const MATRIXF* t,const MATRIXF* t2,MATRIXF* p,char nodiag,size_t memlimit) { #define CLEANUP CLEANMATF(tnew)CLEANMATF(tnew2) MATRIXF *tnew,*tnew2; //(nt,ns) Supernormalized transcript matrix @@ -210,6 +198,13 @@ static int pij_rank_any(const MATRIXF* t,const MATRIXF* t2,MATRIXF* p,char nodia if(!(tnew&&tnew2)) ERRRET("Not enough memory.") + //Check for identical rows in input data + { + VECTORFF(view) vbuff1=MATRIXFF(column)(tnew,0); + VECTORFF(view) vbuff2=MATRIXFF(row)(tnew2,0); + MATRIXFF(cmprow)(t,t2,&vbuff1.vector,&vbuff2.vector,nodiag,1); + } + //Step 1: Supernormalization LOG(9,"Supernormalizing...") MATRIXFF(memcpy)(tnew,t); @@ -228,7 +223,7 @@ static int pij_rank_any(const MATRIXF* t,const MATRIXF* t2,MATRIXF* p,char nodia VECTORFF(set_zero)(&vv.vector); } //Step 3: Convert log likelihood ratios to probabilities - if((ret=pij(p,ns,nodiag,0))) + if((ret=pij_rank_llrtopij(p,ns,nodiag,0))) LOG(1,"Failed to convert log likelihood ratios to probabilities.") //Cleanup @@ -237,18 +232,6 @@ static int pij_rank_any(const MATRIXF* t,const MATRIXF* t2,MATRIXF* p,char nodia #undef CLEANUP } -int pij_rank_a(const MATRIXF* t,const MATRIXF* t2,MATRIXF* p,char nodiag,size_t memlimit) -{ - return pij_rank_any(t,t2,p,nodiag,pij_rank_llrtopij_a,memlimit); -} - -int pij_rank(const MATRIXF* t,const MATRIXF* t2,MATRIXF* p,char nodiag,size_t memlimit) -{ - return pij_rank_a(t,t2,p,nodiag,memlimit); -} - - - diff --git a/pij/rank.h b/pij/rank.h index 2dd7368..d9716be 100644 --- a/pij/rank.h +++ b/pij/rank.h @@ -1,4 +1,4 @@ -/* Copyright 2016, 2017 Lingfei Wang +/* Copyright 2016-2018 Lingfei Wang * * This file is part of Findr. * @@ -45,11 +45,16 @@ extern "C" int pij_rank_pv(const MATRIXF* t,const MATRIXF* t2,MATRIXF* p,size_t memlimit); /* Calculate probabilities of A--B based on LLR distributions of real data - * and null hypothesis. Conversion from LLR to probability is per A. - * See pij_rank_pij_any. + * and null hypothesis. + * t: (ng,ns) Expression data for A + * t2: (nt,ns) Expression data for B + * p: (ng,nt) Output for probabilities A--B is true + * nodiag: When the top ng rows of t2 is exactly t, diagonals of pij are meaningless. + * In this case, set nodiag to 1 to avoid inclusion of NANs. For nodiag=0, t and t2 + * should not have any identical genes. + * memlimit:Specifies approximate memory usage. Function can fail if memlimit is too small. For large dataset, memory usage will be reduced by spliting t into smaller chunks and infer separately. For unlimited memory, set memlimit=-1. + * Return: 0 if succeed. */ -int pij_rank_a(const MATRIXF* t,const MATRIXF* t2,MATRIXF* p,char nodiag,size_t memlimit); -//Duplicate with a different name int pij_rank(const MATRIXF* t,const MATRIXF* t2,MATRIXF* p,char nodiag,size_t memlimit); @@ -66,7 +71,6 @@ int pij_rank(const MATRIXF* t,const MATRIXF* t2,MATRIXF* p,char nodiag,size_t me - #ifdef __cplusplus } #endif