Skip to content

Commit

Permalink
v1.4.1.2
Browse files Browse the repository at this point in the history
  • Loading branch information
wym6912 committed Jul 12, 2023
1 parent 7b915c9 commit 21baffc
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 8 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@

Modified by wym6912 wym6912@outlook.com

## Updates (v1.4.1.2)

- Fix bug in outputting RNA sequences in `v1.4.1.1`
- Add `BLOSUM62` matrix arugment (`-B/--blosum`)

## Updates (v1.4.1.1)

- Add Windows compile support in `Visual Studio 2022`
Expand Down
Binary file modified abPOA.rc
Binary file not shown.
1 change: 1 addition & 0 deletions include/abpoa.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ typedef struct {
int align_mode, gap_mode, max_n_cons;
double min_freq; // for multiploid data
int verbose; // to control output msg
int has_u; // detect RNA sequences

// char LogTable65536[65536];
// char bit_table16[65536];
Expand Down
7 changes: 5 additions & 2 deletions src/abpoa.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ char PROG[20] = "abpoa";
#define _bO BOLD UNDERLINE "O" NONE
#define _bA BOLD UNDERLINE "A" NONE
char DESCRIPTION[200] = _ba "daptive " _bb "anded " _bP "artial " _bO "rder " _bA "lignment";
char VERSION[20] = "1.4.1.1";
char VERSION[20] = "1.4.1.2";
char CONTACT[30] = "gaoy1@chop.edu";

const struct option abpoa_long_opt [] = {
Expand Down Expand Up @@ -55,6 +55,7 @@ const struct option abpoa_long_opt [] = {
{ "out-pog", 1, NULL, 'g' },
{ "max-num-cons", 1, NULL, 'd', },
{ "min-freq", 1, NULL, 'q', },
{ "blosum", 0, NULL, 'B', },

{ "help", 0, NULL, 'h' },
{ "version", 0, NULL, 'v' },
Expand Down Expand Up @@ -115,6 +116,7 @@ int abpoa_usage(void)
err_printf(" Input/Output:\n");
err_printf(" -Q --use-qual-weight take base quality score from FASTQ input file as graph edge weight [False]\n");
err_printf(" -c --amino-acid input sequences are amino acid (default is nucleotide) [False]\n");
err_printf(" -B --blosum use BLOSUM62 for protein alignment [False]\n");
err_printf(" -l --in-list input file is a list of sequence file names [False]\n");
err_printf(" each line is one sequence file containing a set of sequences\n");
err_printf(" which will be aligned by abPOA to generate a consensus sequence\n");
Expand Down Expand Up @@ -161,7 +163,7 @@ int abpoa_main(char *file_fn, int is_list, abpoa_para_t *abpt){

int main(int argc, char **argv) {
int c, m, in_list=0; char *s; abpoa_para_t *abpt = abpoa_init_para();
while ((c = getopt_long(argc, argv, "m:M:X:t:O:E:b:f:z:e:QSk:w:n:i:clpso:r:g:d:q:hvV:", abpoa_long_opt, NULL)) >= 0) {
while ((c = getopt_long(argc, argv, "m:M:X:t:O:E:b:f:z:e:QSk:w:n:i:clpsBo:r:g:d:q:hvV:", abpoa_long_opt, NULL)) >= 0) {
switch(c)
{
case 'm': m = atoi(optarg);
Expand Down Expand Up @@ -203,6 +205,7 @@ int main(int argc, char **argv) {
else err_printf("Error: unknown output result mode: %s.\n", optarg);
break;
case 'g': abpt->out_pog= strdup(optarg); break;
case 'B': abpt->use_score_matrix = 2; break;

case 'd': abpt->max_n_cons = atoi(optarg); break;
case 'q': abpt->min_freq = atof(optarg); break;
Expand Down
1 change: 1 addition & 0 deletions src/abpoa.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ typedef struct {
int align_mode, gap_mode, max_n_cons;
double min_freq; // for multiploid data
int verbose; // to control output msg
int has_u; // detect RNA sequences

// char LogTable65536[65536];
// char bit_table16[65536];
Expand Down
74 changes: 71 additions & 3 deletions src/abpoa_align.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ extern char ab_aa26_table[256];
extern char ab_aa256_table[256];
extern char ab_char26_table[256];
extern char ab_char256_table[256];
extern char ab_blosum62[][100];

void parse_mat_first_line(char *l, int *order) {
int i, n;
Expand Down Expand Up @@ -84,6 +85,40 @@ void abpoa_set_mat_from_file(abpoa_para_t *abpt, char *mat_fn) {
free(l); free(order); fclose(fp);
}

void abpoa_set_mat_from_var(abpoa_para_t *abpt, char** matrix, int abpt_sz)
{
char *this_line = matrix[0];
int first_line = 1;
if(abpt_sz != abpt->m)
{
err_func_printf(__func__, "Warning: matrix length %d is different with default setting (%d). Use default matrix", abpt->m, abpt_sz);
return;
}
int *order = (int*)_err_malloc(abpt_sz * sizeof(int));
for(; strlen(this_line) > 0; ++ this_line)
{
if(first_line)
{
first_line = 0;
// get string bases
parse_mat_first_line(this_line, order);
}
else
{
// get match/mismatch scores
parse_mat_score_line(this_line, order, abpt->m, abpt->mat);
}
}
int i; abpt->min_mis = 0, abpt->max_mat = 0;
for (i = 0; i < abpt->m * abpt->m; ++i) {
if (abpt->mat[i] > abpt->max_mat)
abpt->max_mat = abpt->mat[i];
if (-abpt->mat[i] > abpt->min_mis)
abpt->min_mis = -abpt->mat[i];
}
free(order);
}

void abpoa_set_gap_mode(abpoa_para_t *abpt) {
if (abpt->gap_open1 == 0) abpt->gap_mode = ABPOA_LINEAR_GAP;
else if (abpt->gap_open1 > 0 && abpt->gap_open2 == 0) abpt->gap_mode = ABPOA_AFFINE_GAP;
Expand Down Expand Up @@ -115,6 +150,7 @@ abpoa_para_t *abpoa_init_para(void) {
// number of residue types
abpt->m = 5; // nucleotide
abpt->mat = (int*)_err_malloc(abpt->m * abpt->m * sizeof(int));
abpt->has_u = 0; // DNA sequence

// score matrix
abpt->use_score_matrix = 0;
Expand Down Expand Up @@ -164,7 +200,14 @@ void abpoa_post_set_para(abpoa_para_t *abpt) {
}
}
if (abpt->use_score_matrix == 0) gen_simple_mat(abpt);
else abpoa_set_mat_from_file(abpt, abpt->mat_fn);
else if(abpt->use_score_matrix == 1) abpoa_set_mat_from_file(abpt, abpt->mat_fn);
else if(abpt->use_score_matrix == 2)
{
_err_simple_func_printf("Try to use BLOSUM62 matrix");
abpoa_set_mat_from_var(abpt, (char**)ab_blosum62, 26);

}
else err_fatal_core(__func__, "score matrix = %d, which is unsupported.\n", abpt->use_score_matrix);
}

void abpoa_free_para(abpoa_para_t *abpt) {
Expand Down Expand Up @@ -443,7 +486,7 @@ int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp) {
abpoa_seq_t *abs = ab->abs; int exist_n_seq = abs->n_seq;

// read seq from read_fn
FILE* readfp = fopen(read_fn, "r"); kseq_t *ks = kseq_init(fileno(readfp));
FILE* readfp = xopen(read_fn, "r"); kseq_t *ks = kseq_init(fileno(readfp));
int i, j, n_seq = abpoa_read_seq(abs, ks);

// always reset graph before perform POA
Expand All @@ -452,21 +495,46 @@ int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp) {
if (abs->seq[i].l > max_len) max_len = abs->seq[i].l;
}

// detect u and t
int has_st = 0, has_su = 0;

// set seqs, seq_lens
extern char ab_char26_table[256];
uint8_t **seqs = (uint8_t**)_err_malloc(n_seq * sizeof(uint8_t*)); int *seq_lens = (int*)_err_malloc(n_seq * sizeof(int));
int **weights = (int**)_err_malloc(n_seq * sizeof(int*));
char ch;
for (i = 0; i < n_seq; ++i) {
seq_lens[i] = abs->seq[exist_n_seq+i].l;
seqs[i] = (uint8_t*)_err_malloc(sizeof(uint8_t) * seq_lens[i]);
weights[i] = (int*)_err_malloc(sizeof(int) * seq_lens[i]);
for (j = 0; j < seq_lens[i]; ++j) seqs[i][j] = ab_char26_table[(int)abs->seq[exist_n_seq+i].s[j]];
for (j = 0; j < seq_lens[i]; ++j)
{
ch = abs->seq[exist_n_seq+i].s[j];
seqs[i][j] = ab_char26_table[(int)ch];
if(ch == 'U' || ch == 'u') ++ has_su;
else if(ch == 'T' || ch == 't') ++ has_st;
}
if (abpt->use_qv && abs->qual[exist_n_seq+i].l > 0) {
for (j = 0; j < seq_lens[i]; ++j) weights[i][j] = (int)abs->qual[exist_n_seq+i].s[j]-32;
} else {
for (j = 0; j < seq_lens[i]; ++j) weights[i][j] = 1;
}
}
if(abpt->m == 5) // nuc
{
if(has_st && has_su) err_fatal_core(__func__, "DNA sequence has U and T. Please check the sequences, use --amino-acid or -c if sequences are protein.\n");
else if(has_su)
{
_err_simple_func_printf("detected RNA sequences");
abpt->has_u = 1;
}
else
{
_err_simple_func_printf("detected DNA sequences");
abpt->has_u = 0;
}
}
else _err_simple_func_printf("detected Protein sequences\n");
if ((abpt->disable_seeding && abpt->progressive_poa==0) || abpt->align_mode != ABPOA_GLOBAL_MODE) {
abpoa_poa(ab, abpt, seqs, weights, seq_lens, exist_n_seq, n_seq);
} else {
Expand Down
10 changes: 7 additions & 3 deletions src/abpoa_output.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,7 @@ abpoa_cons_t *abpoa_allocate_rc_msa(abpoa_cons_t *abc, int msa_len, int n_seq, i
void abpoa_output_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) {
if (out_fp == NULL) return;
// print U in RNA
short changedU = (ab_char256_table['U'] != 'U'); char rawU = ab_char256_table['U'];
if(changedU) ab_char256_table['U'] = 'U';
if(abpt->has_u) ab_char256_table[3] = 'U';
int i, j;
abpoa_seq_t *abs = ab->abs; abpoa_cons_t *abc = ab->abc;
if (abc->msa_len <= 0) return;
Expand Down Expand Up @@ -101,7 +100,7 @@ void abpoa_output_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) {
fprintf(out_fp, "\n");
}
}
ab_char256_table['U'] = rawU; // revert change
ab_char256_table[3] = 'T'; // revert change
}

void abpoa_set_msa_seq(abpoa_node_t node, int rank, uint8_t **msa_base) {
Expand Down Expand Up @@ -190,6 +189,8 @@ void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) {

kdq_int_t *q = kdq_init_int();

if(abpt->has_u) ab_char256_table[3] = 'U';

// Breadth-First-Search
kdq_push_int(q, ABPOA_SRC_NODE_ID);
while ((id = kdq_shift_int(q)) != 0) {
Expand Down Expand Up @@ -269,6 +270,7 @@ void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) {
free(in_degree);
for (i = 0; i < n_seq; ++i) free(read_paths[i]);
free(read_paths); free(read_path_i);
ab_char256_table[3] = 'T';
}

int abpoa_cons_phred_score(int n_cov, int n_seq) {
Expand Down Expand Up @@ -498,6 +500,7 @@ void abpoa_multip_heaviest_bundling(abpoa_graph_t *abg, abpoa_para_t *abpt, int

void abpoa_output_fx_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) {
if (out_fp == NULL) return;
if(abpt->has_u) ab_char256_table[3] = 'U';
int cons_i, j;
abpoa_cons_t *abc = ab->abc;
for (cons_i = 0; cons_i < abc->n_cons; ++cons_i) {
Expand Down Expand Up @@ -529,6 +532,7 @@ void abpoa_output_fx_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) {
} fprintf(out_fp, "\n");
}
}
ab_char256_table[3] = 'T';
}

abpoa_cons_t *abpoa_allocate_cons(abpoa_cons_t *abc, int n_node, int n_seq, int n_cons) {
Expand Down
32 changes: 32 additions & 0 deletions src/abpoa_seq.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,38 @@ const char ab_aa256_table[256] = {
char ab_char26_table[256];
char ab_char256_table[256];

char ab_blosum62[][100] = {
{"A R N D C Q E G H I L K M F P S T W Y V B J Z X * O U"},
{"A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 -1 -1 -4 -4 -4"},
{"R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 -2 0 -1 -4 -4 -4"},
{"N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 4 -3 0 -1 -4 -4 -4"},
{"D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 -3 1 -1 -4 -4 -4"},
{"C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -1 -3 -1 -4 -4 -4"},
{"Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 -2 4 -1 -4 -4 -4"},
{"E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 -3 4 -1 -4 -4 -4"},
{"G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -4 -2 -1 -4 -4 -4"},
{"H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 -3 0 -1 -4 -4 -4"},
{"I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 3 -3 -1 -4 -4 -4"},
{"L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 3 -3 -1 -4 -4 -4"},
{"K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 -3 1 -1 -4 -4 -4"},
{"M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 2 -1 -1 -4 -4 -4"},
{"F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 0 -3 -1 -4 -4 -4"},
{"P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -3 -1 -1 -4 -4 -4"},
{"S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 -2 0 -1 -4 -4 -4"},
{"T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 -1 -1 -4 -4 -4"},
{"W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -2 -2 -1 -4 -4 -4"},
{"Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -1 -2 -1 -4 -4 -4"},
{"V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 2 -2 -1 -4 -4 -4"},
{"B -2 -1 4 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 -3 0 -1 -4 -4 -4"},
{"J -1 -2 -3 -3 -1 -2 -3 -4 -3 3 3 -3 2 0 -3 -2 -1 -2 -1 2 -3 3 -3 -1 -4 -4 -4"},
{"Z -1 0 0 1 -3 4 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -2 -2 -2 0 -3 4 -1 -4 -4 -4"},
{"X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -4 -4 -4"},
{"* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 -4 -4"},
{"O -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 -4"},
{"U -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1"},
{""}
};

abpoa_seq_t *abpoa_init_seq(void) {
abpoa_seq_t *abs = (abpoa_seq_t*)_err_malloc(sizeof(abpoa_seq_t));
abs->n_seq = 0; abs->m_seq = CHUNK_READ_N;
Expand Down

0 comments on commit 21baffc

Please sign in to comment.