From a0f8204a6823ef1cb44d68553270791e753b0ae0 Mon Sep 17 00:00:00 2001 From: dytk2134 Date: Tue, 4 Sep 2018 16:20:08 -0400 Subject: [PATCH 1/2] add user-defined example files --- .appveyor.yml | 3 +++ .travis.yml | 3 +++ example_file/u1.txt | 3 +++ example_file/u2.txt | 2 ++ gff3tool/lib/gff3_merge/auto_replace_tag.py | 8 ++++++++ 5 files changed, 19 insertions(+) create mode 100644 example_file/u1.txt create mode 100644 example_file/u2.txt diff --git a/.appveyor.yml b/.appveyor.yml index 3f6be65..9bc68a6 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -30,6 +30,9 @@ test_script: - gff3_QC -g example_file/example.gff3 -f example_file/reference.fa -o error.txt - gff3_fix -qc_r error.txt -g example_file/example.gff3 -og corrected.gff3 - gff3_merge -g1 example_file/new_models.gff3 -g2 example_file/reference.gff3 -f example_file/reference.fa -og merged.gff -r merged_report.txt + - gff3_merge -g1 example_file/new_models.gff3 -g2 example_file/reference.gff3 -f example_file/reference.fa -og merged.gff -u1 example_file/u1.txt -u2 example_file/u2.txt -r merged_report.txt + - gff3_merge -g1 example_file/new_models.gff3 -g2 example_file/reference.gff3 -f example_file/reference.fa -og merged.gff -u1 example_file/u1.txt -r merged_report.txt + - gff3_merge -g1 example_file/new_models.gff3 -g2 example_file/reference.gff3 -f example_file/reference.fa -og merged.gff -u2 example_file/u2.txt -r merged_report.txt - gff3_merge -g1 example_file/new_models_w_replace.gff3 -g2 example_file/reference.gff3 -f example_file/reference.fa -og merged.gff -r merged_report.txt -noAuto - gff3_sort -g example_file/example.gff3 -og example-sorted.gff3 - ps: Write-Host "Test scripts are finished ..." 
diff --git a/.travis.yml b/.travis.yml index 2fa79d6..8c6d02c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,6 +30,9 @@ script: - gff3_QC -g example_file/example.gff3 -f example_file/reference.fa -o error.txt - gff3_fix -qc_r error.txt -g example_file/example.gff3 -og corrected.gff3 - gff3_merge -g1 example_file/new_models.gff3 -g2 example_file/reference.gff3 -f example_file/reference.fa -og merged.gff -r merged_report.txt + - gff3_merge -g1 example_file/new_models.gff3 -g2 example_file/reference.gff3 -f example_file/reference.fa -og merged.gff -u1 example_file/u1.txt -u2 example_file/u2.txt -r merged_report.txt + - gff3_merge -g1 example_file/new_models.gff3 -g2 example_file/reference.gff3 -f example_file/reference.fa -og merged.gff -u1 example_file/u1.txt -r merged_report.txt + - gff3_merge -g1 example_file/new_models.gff3 -g2 example_file/reference.gff3 -f example_file/reference.fa -og merged.gff -u2 example_file/u2.txt -r merged_report.txt - gff3_merge -g1 example_file/new_models_w_replace.gff3 -g2 example_file/reference.gff3 -f example_file/reference.fa -og merged.gff -r merged_report.txt -noAuto - gff3_sort -g example_file/example.gff3 -og example-sorted.gff3 - gff3_to_fasta -g example_file/example.gff3 -f example_file/reference.fa -st all -d simple -o test_sequences diff --git a/example_file/u1.txt b/example_file/u1.txt new file mode 100644 index 0000000..dec819a --- /dev/null +++ b/example_file/u1.txt @@ -0,0 +1,3 @@ +transcript exon +mRNA CDS +mRNA exon \ No newline at end of file diff --git a/example_file/u2.txt b/example_file/u2.txt new file mode 100644 index 0000000..f58509d --- /dev/null +++ b/example_file/u2.txt @@ -0,0 +1,2 @@ +mRNA CDS +mRNA exon \ No newline at end of file diff --git a/gff3tool/lib/gff3_merge/auto_replace_tag.py b/gff3tool/lib/gff3_merge/auto_replace_tag.py index 40e1e23..036e4c5 100755 --- a/gff3tool/lib/gff3_merge/auto_replace_tag.py +++ b/gff3tool/lib/gff3_merge/auto_replace_tag.py @@ -76,7 +76,15 @@ def main(gff1, gff2, 
fasta, outdir, scode, logger, all_assign=False, user_define transcripts.add(id) gff2_transcripts_type = set() if user_defined2 is None: + roots = [] for line in gff3_2.lines: + try: + if line['line_type'] == 'feature': + if 'Parent' not in line['attributes'] and len(line['attributes']) != 0: + roots.append(line) + except KeyError: + pass + for root in roots: for child in root['children']: if 'type' in child: gff2_transcripts_type.add(child['type']) From 057792e9f14b6ea47600c889b3f4bb03d90f63d3 Mon Sep 17 00:00:00 2001 From: dytk2134 Date: Thu, 6 Sep 2018 07:42:54 -0400 Subject: [PATCH 2/2] update Merge-two-GFF3-files.md --- docs/Merge-two-GFF3-files.md | 51 +++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/docs/Merge-two-GFF3-files.md b/docs/Merge-two-GFF3-files.md index fc9de2f..d4c25eb 100644 --- a/docs/Merge-two-GFF3-files.md +++ b/docs/Merge-two-GFF3-files.md @@ -8,6 +8,8 @@ [Automatically assigning replace tags](#automatically-assigning-replace-tags) +[Rules for using user-defined files](#rules-for-using-user-defined-files) + [Rules for adding a replace tag on your own](#rules-for-adding-a-replace-tag-on-your-own) [Replacing and adding models with multiple isoforms](#replacing-and-adding-models-with-multiple-isoforms) @@ -70,13 +72,56 @@ LGIB01000001.1 Gnomon CDS 359515 359920 . - 1 ID=cds33 ### Automatically assigning replace tags ([back](#table-of-contents)) -You can choose to have the program auto-assign [replace tags](#replace-tags) for you. (This is the default behavior.) **The auto-assignment program ONLY works for mRNA features.** For all other feature types, if there is no replace tag, the program will add 'replace=NA'. The program will identify which mRNA models from the modified GFF3 file overlap in coding sequence with models from the reference GFF3 file. The program will add a 'replace' attribute with the IDs of overlapping models. 
Specifically, the program will do the following: -- Extract CDS and pre-mRNA sequences from mRNA features from both GFF3 files. -- Use blastn to determine which sequences from the modified and reference GFF3 file align to each other **in their coding sequence**. These parameters are used: `-evalue 1e-10 -penalty -15 -ungapped` +You can choose to have the program auto-assign [replace tags](#replace-tags) for you. (This is the default behavior.) The program will identify which models from the modified GFF3 file overlap in coding/non-coding sequence with models from the reference GFF3 file. The program will add a 'replace' attribute with the IDs of overlapping models. Specifically, the program will do the following: +- Extract CDS and pre-mRNA sequences from mRNA features from both GFF3 files. (For all other feature types, this program will extract transcript and pre-transcript from both GFF3 files) +- Use blastn to determine which sequences from the modified and reference GFF3 file align to each other **in their coding/non-coding sequence**. These parameters are used: `-evalue 1e-10 -penalty -15 -ungapped` - If two models pass the alignment step, the program will add a 'replace' attribute with the ID of each overlapping model to the modified gff3 file. - If no reference model overlaps with a new model, then the program will add 'replace=NA'. - If one model overlaps another in an intron or UTR (but not within the coding sequence), the auto-assignment program will NOT assign a replace tag. This is because it's not always clear whether the overlapping model should be replaced. You will receive a warning message that this model does not have a replace tag and therefore was not incorporated into the merged gff3 file. You can then go back and manually add a replace tag to the original gff3 file. +### Rules for using user-defined files +([back](#table-of-contents)) + +By default, the program will only use exon to generate spliced sequences for transcripts. 
If you choose to have the program auto-assign replace tags but there is a model without exon features in your GFF3 files, then you must provide user-defined files that specify the parent and child features to use for sequence extraction. + +**Example**: a user-defined file for extracting CDS sequences from mRNA, using exon to generate spliced sequences for miRNA, and using pseudogenic_exon to generate spliced sequences for pseudogenic_transcript. + +User-defined file: +``` +mRNA CDS +miRNA exon +pseudogenic_transcript pseudogenic_exon +``` + +**Usage**: The user-defined files can be specified via the **--user_defined_file1** and **--user_defined_file2** arguments. You can give --user_defined_file1 for sequence extraction from the updated GFF3 file, --user_defined_file2 for sequence extraction from the reference GFF3 file, or both. Then, the program will use blastn to determine which sequences from the updated and reference GFF3 files align to each other. Specifically, the program will run blastn with the following query and subject sequences: + +- If **--user_defined_file1** is given + +Query sequence | Subject sequence +--- | --- +user-defined sequences from updated GFF3 file | CDS sequences from reference GFF3 file +user-defined sequences from updated GFF3 file | transcript sequences from reference GFF3 file +pre-transcript sequences from updated GFF3 file | pre-transcript sequences from reference GFF3 file + +- If **--user_defined_file2** is given + +Query sequence | Subject sequence +--- | --- +CDS sequences from updated GFF3 file | user-defined sequences from reference GFF3 file +transcript sequences from updated GFF3 file | user-defined sequences from reference GFF3 file +pre-transcript sequences from updated GFF3 file | pre-transcript sequences from reference GFF3 file + +- If both **--user_defined_file1** and **--user_defined_file2** are given + +Query sequence | Subject sequence +--- | --- +user-defined sequences from updated GFF3 file | user-defined sequences from reference GFF3 file 
+pre-transcript sequences from updated GFF3 file | pre-transcript sequences from reference GFF3 file + +**Note**: +- In each parent-child pair, the parent feature should be a transcript (e.g. mRNA, ncRNA) and the child feature should be one of its child features (e.g. exon, CDS). +- This program will only generate sequences for the parent-child pairs listed in the user-defined file. + ### Rules for adding a replace tag on your own ([back](#table-of-contents))