Merge pull request #158 from tiagofilipe12/backend_1.6.0

Backend 1.6.0
tiagofilipe12 · Sep 25, 2018 · d04e2af · d04e2af
2 parents fc1d59d + 878da31
commit d04e2af
Show file tree

Hide file tree

Showing 24 changed files with 287 additions and 122 deletions.
diff --git a/changelog.md b/changelog.md
@@ -1,8 +1,14 @@
 # Changelog
 
-## Upcoming version (1.6.0)
+## Version 1.6.0
 
-### Database
+### Back end
+- Added code that saves a list of all the accession numbers to a file,
+so that future changes to the database can be easily documented.
+- Added a black list for accession numbers that are reported to be wrongly
+classified as plasmids in the RefSeq database.
+- Updated the database to NCBI RefSeq 091418.
+- Added a first implementation to parse results from the new PlasmidFinder database.
 
 ### Front end
 
@@ -30,6 +36,10 @@ multiple nodes.
 modals.
 - Implemented highlight and filter for all node selections (taxa, resistances,
 plasmid families, virulence and combined selections).
+- Added an FAQ entry on how to report sequences that aren't plasmids.
+- Removed histogram from length plot.
+- Added a new button that allows users to more easily report a sequence, by using
+the GitHub API for pre-filled issues.
 
 #### Bug fixes
 - Fixed minor issues after filtering datasets for link selections and for shift

diff --git a/docs/Sidebar.md b/docs/Sidebar.md
@@ -41,7 +41,33 @@ that display pATLAS key features.
 
 #### FAQs
 
-A set of questions that may occur to users when using pATLAS
+A set of questions that may occur to users when using pATLAS. 
+
+#### Report sequence
+
+Here users can report any problem that they find with a plasmid available in
+pATLAS, for instance when an entry is a gene sequence rather than a plasmid. This
+is part of the _crowd curation_ of pATLAS, an initiative that aims to
+ease the curation of the database by letting users flag RefSeq
+sequences that shouldn't be listed as plasmids. pATLAS already has some filters
+that prevent non-plasmid sequences from getting into the database; however,
+new issues are expected to arise with each database update, so we
+acknowledge every contribution that helps us curate the database.
+
+When clicked, this button will open a pre-formatted GitHub issue like the one below:
+
+![](gitbook/images/gh_issue.png)
+
+The reporting user has to replace `<brief title of the issue>` with the
+desired title. The body of the issue is already pre-filled with
+two headers:
+
+* `Sequences accessions` - Here users should state the accession numbers
+of the sequences that have issues.
+
+* `Description of the issue` - Here users should state the reason why
+the sequence should be removed/curated from the pATLAS plasmid database.
+
 
 ## About
 

diff --git a/patlas/MASHix.py b/patlas/MASHix.py
@@ -1,10 +1,10 @@
 #!/usr/bin/env python3
 
-## Last update: 11/6/2018
-## Author: T.F. Jesus
-## This script runs MASH in plasmid databases making a pairwise diagonal matrix
-## for each pairwise comparison between libraries
-## Note: each header in fasta is considered a reference
+# Last update: 14/8/2018
+# Author: T.F. Jesus
+# This script runs MASH in plasmid databases making a pairwise diagonal matrix
+# for each pairwise comparison between libraries
+# Note: each header in fasta is considered a reference
 
 import argparse
 import sys
@@ -21,16 +21,17 @@
 try:
     from utils.hist_util import plot_histogram
     from utils.taxa_fetch import executor
+    from utils.crowd_curation import black_list
     from db_manager.db_app import db, models
 except ImportError:
     from patlas.utils.hist_util import plot_histogram
     from patlas.utils.taxa_fetch import executor
+    from patlas.utils.crowd_curation import black_list
     from patlas.db_manager.db_app import db, models
 
-# This is a rather sketchy solution TODO remove this with a refactor of node_crawler
+# TODO This is a rather sketchy solution, remove this with a refactor of node_crawler
 sys.setrecursionlimit(10000)
 
-
 class Record:
 
     def __init__(self, accession, size, distance, percentage_hashes):
@@ -103,7 +104,6 @@ def output_tree(infile, tag):
 
     """
 
-
     mother_directory = os.path.join(os.path.dirname(os.path.abspath(infile)),
                                     tag)
     dirs = ["", "tmp", "results", "reference_sketch", "genome_sketchs",
@@ -233,6 +233,10 @@ def master_fasta(fastas, output_tag, mother_directory):
 
     species_output = open(species_out, "w")
 
+    # creates a list of accession numbers, listing all accessions in
+    # input sequences
+    all_accessions = []
+
     # sets first length instance
     length = 0
     accession = False
@@ -306,18 +310,21 @@ def master_fasta(fastas, output_tag, mother_directory):
                 plasmid_name = search_substing(line)
                 # species related functions
                 all_species.append(" ".join(species.split("_")))
+                # append accession that will be written to file
+                all_accessions.append(accession)
 
                 # added this if statement to check whether CDS is present in
                 # fasta header, since database contain them with CDS in string
                 if "cds" in line.lower() and line.lower().count("cds") <= 1 \
                         and "plasmid" not in line.lower():
                     truePlasmid = False
                     reason = "cds"
-                   #continue
                 elif "origin" in line.lower():
                     truePlasmid = False
                     reason = "origin"
-                    #continue
+                elif accession in black_list:
+                    truePlasmid = False
+                    reason = black_list[accession]
                 else:
                     truePlasmid = True
 
@@ -362,6 +369,14 @@ def master_fasta(fastas, output_tag, mother_directory):
     # writes a species list to output file
     species_output.write("\n".join(str(i) for i in list(set(all_species))))
     species_output.close()
+
+    # write accessions to a file
+    accession_out = os.path.join(mother_directory,
+                                 "accessions_list_{}.lst".format(output_tag))
+    with open(accession_out, "w") as fh:
+        fh.write("version: 1.5.2\n")
+        fh.write("\n".join(all_accessions))
+
     return out_file, sequence_info, all_species
 
 
@@ -873,7 +888,8 @@ def main():
 
     mash_options = parser.add_argument_group("MASH related options")
     mash_options.add_argument("-k", "--kmers", dest="kmer_size", default="21",
-                              help="Provide the number of k-mers to be provided to mash "
+                              help="Provide the number of k-mers to be provided"
+                                   " to mash "
                                    "sketch. Default: 21.")
     mash_options.add_argument("-p", "--pvalue", dest="pvalue",
                               default="0.05", help="Provide the p-value to "
@@ -888,8 +904,8 @@ def main():
     other_options = parser.add_argument_group("Other options")
     other_options.add_argument("-rm", "--remove", dest="remove",
                                action="store_true", help="Remove any temporary "
-                                                         "files and folders not "
-                                                         "needed (not present "
+                                                         "files and folders not"
+                                                         " needed (not present "
                                                          "in results "
                                                          "subdirectory).")
     other_options.add_argument("-hist", "--histograms", dest="histograms",
@@ -911,8 +927,8 @@ def main():
                                help="this option allows to only run the part "
                                     "of the script that is required to "
                                     "generate the filtered fasta. Allowing for "
-                                    "instance to debug sequences that shoudn't "
-                                    "be removed using 'cds' and 'origin' "
+                                    "instance to debug sequences that shouldn't"
+                                    " be removed using 'cds' and 'origin' "
                                     "keywords")
 
     args = parser.parse_args()
@@ -929,23 +945,22 @@ def main():
     names_file = args.names_file
     nodes_file = args.nodes_file
 
-    ## lists all fastas given to argparser
+    # lists all fastas given to argparser
     fastas = [f for f in args.inputfile if f.endswith((".fas", ".fasta",
                                                        ".fna", ".fsa", ".fa"))]
 
-    ## creates output directory tree
-    output_tag = args.output_tag.replace("/", "")  ## if the user gives and
+    # creates output directory tree
+    output_tag = args.output_tag.replace("/", "")  # if the user gives and
     # input tag that is already a folder
     mother_directory = output_tree(fastas[0], output_tag)
 
-    ## checks if multiple fastas are provided or not avoiding master_fasta
+    # checks if multiple fastas are provided or not avoiding master_fasta
     # function
     print("***********************************")
     print("Creating main database...\n")
     main_fasta, sequence_info, all_species = master_fasta(fastas, output_tag,
                                              mother_directory)
 
-
     # if the parameter sequences_to_remove is provided the script will only
     # generate the fasta files and a list of the sequences that were removed
     # from ncbi refseq original fasta.
@@ -954,11 +969,11 @@ def main():
               "Leaving script...")
         sys.exit(0)
 
-    #########################
-    ### genera block here ###
-    #########################
+    #####################
+    # genera block here #
+    #####################
 
-    ## runs mash related functions
+    # runs mash related functions
     print("***********************************")
     print("Sketching reference...\n")
     ref_sketch = sketch_references(main_fasta, output_tag, threads, kmer_size,
@@ -969,7 +984,7 @@ def main():
     print("Making temporary files for each genome in fasta...\n")
     genomes = genomes_parser(main_fasta, mother_directory)
 
-    ## This must be multiprocessed since it is extremely fast to do mash
+    # This must be multiprocessed since it is extremely fast to do mash
     # against one plasmid sequence
     print("***********************************")
     print("Sketching genomes and running mash distances...\n")
@@ -979,7 +994,7 @@ def main():
                                      output_tag, kmer_size, mother_directory),
                              genomes)  # process genomes iterable with pool
 
-    ## loop to print a nice progress bar
+    # loop to print a nice progress bar
     try:
         for _ in tqdm.tqdm(mp, total=len(genomes)):
             pass
@@ -991,7 +1006,7 @@ def main():
     # remaining options are triggered
     print("\nFinished MASH... uf uf uf!")
 
-    ## Makes distances matrix csv file
+    # Makes distances matrix csv file
     print("\n***********************************")
     print("Creating distance matrix...\n")
     lists_traces = mash_distance_matrix(mother_directory, sequence_info,

diff --git a/patlas/db_manager/db_app/resources.py b/patlas/db_manager/db_app/resources.py
@@ -85,7 +85,6 @@ class GetResistances(Resource):
     def post(self):
         var_response = request.form["accession"].replace("[", "")\
             .replace("]", "").replace('"', "").split(",")
-        print(var_response)
         single_query = db.session.query(Card).filter(
             Card.plasmid_id.in_(var_response)).all()
         return single_query
@@ -149,13 +148,15 @@ def get(self):
         args = req_parser.parse_args()
         # This queries name object in json_entry and retrieves an array with
         # all objects that matched the args (json_entry, plasmid_id)
+        parsed_gene = args.gene.replace('"', '')    # TODO parser for new plasmidfinder db
         records = db.session.query(Database).filter(
-            Database.json_entry["gene"].astext.contains(args.gene)
+            Database.json_entry["gene"].astext.contains(parsed_gene)
         ).all()
         # contains method allows us to query in array that is converted to a
         # string
         return records
 
+
 class GetAccessionVir(Resource):
     @marshal_with(card_field)
     def get(self):

diff --git a/patlas/db_manager/db_app/static/js/download/abrPlusFamilies.js b/patlas/db_manager/db_app/static/js/download/abrPlusFamilies.js
@@ -390,12 +390,37 @@ const plasmidFamilyGetter = (nodeId) => {
 
     try{
       // totalLength array corresponds to gene names
-      const totalLength = data[0].json_entry.gene.replace(/['u\[\] ]/g, "").split(",")
-      const accessionList = data[0].json_entry.accession.replace(/['u\[\] ]/g, "").split(",")
+      let totalLength = data[0].json_entry.gene.replace(/['u\[\] ]/g, "").split(",")
+      let accessionList = data[0].json_entry.accession.replace(/['u\[\] ]/g, "").split(",")
       const coverageList = data[0].json_entry.coverage.replace(/['u\[\] ]/g, "").split(",")
       const identityList = data[0].json_entry.identity.replace(/['u\[\] ]/g, "").split(",")
       const rangeList = data[0].json_entry.seq_range.replace("[[", "[").replace("]]", "]").split("],")
 
+      // TODO parser required for new plasmidfinder db
+      if (accessionList[0] === "") {
+
+        accessionList = totalLength.map( (el) => {
+          const length_split = el.split("_")
+          if (length_split.indexOf("NC") > 0) {
+            return length_split.slice(length_split.length - 2).join("_")
+          } else {
+            return length_split.slice(length_split.length - 1).join("_")
+          }
+        })
+
+        totalLength = totalLength.map( (el) => {
+          const length_split = el.split("_")
+          // check if there is a NC
+          if (length_split.indexOf("NC") > 0) {
+            return length_split.slice(0, length_split.length - 2).join("_").replace(/\_$/, "")
+          } else {
+            return length_split.slice(0, length_split.length - 1).join("_").replace(/\_$/, "")
+          }
+        })
+
+
+      }
+
       for (const i in totalLength) {
         if ({}.hasOwnProperty.call(totalLength, i)) {
 
@@ -405,7 +430,7 @@ const plasmidFamilyGetter = (nodeId) => {
 
           queryArrayPFRange.push( {
               "range": rangeEntry,
-              "genes": customTrim(totalLenght[i], "'"),
+              "genes": customTrim(totalLength[i], "'"),
               "accessions": makeItClickable(accessionList[i].split(":")[0]),
               "coverage": coverageList[i],
               "identity": identityList[i]
@@ -510,7 +535,7 @@ const virulenceGetter = (nodeId) => {
           queryArrayVirRange.push(
             {
               "range": rangeEntry,
-              "genes": customTrim(totalLenght[i], "'"),
+              "genes": customTrim(totalLength[i], "'"),
               "accessions": makeItClickable(accessionList[i].split(":")[0]),
               "coverage": coverageList[i],
               "identity": identityList[i]

diff --git a/patlas/db_manager/db_app/static/js/dropdowns/dropdownPopulation.js b/patlas/db_manager/db_app/static/js/dropdowns/dropdownPopulation.js
@@ -95,11 +95,22 @@ getArrayPf().done((json) => {
   // iterate over the file
   $.each(json, (accession, entry) => {
     const geneEntries = entry.gene
-    for (let i in geneEntries) {
-      if (geneEntries.hasOwnProperty(i)) {
-        if (listPF.indexOf(geneEntries[i]) < 0) {
-          listPF.push(geneEntries[i])
-        }
+    for (let i of geneEntries) {
+
+      //TODO this should be removed once plasmidfinder abricate is used - listPF.push(i), everything else should be ignored
+      const length_split = i.split("_")
+      const parsed_i = length_split
+        .slice(0, length_split.length - 1)
+        .join("_")
+        // replace every _NC in the end
+        .replace(/\_NC$/, "")
+        // then remove _ in the end of the plasmidfinder gene name
+        .replace(/\_$/, "")
+
+      // checks if entry is already in listPF and if so doesn't populate the
+      // dropdown.
+      if (listPF.indexOf(parsed_i) < 0) {
+        listPF.push(parsed_i)
       }
     }
   })

diff --git a/patlas/db_manager/db_app/static/js/input_file_handling/dropdownPopulation.js b/patlas/db_manager/db_app/static/js/input_file_handling/dropdownPopulation.js
@@ -1,4 +1,5 @@
-/*globals colorList, listGiFilter, colorNodes, legendInst, typeOfProject */
+/*globals colorList, listGiFilter, colorNodes, legendInst, typeOfProject,
+blockFilterModal */
 
 // function to remove first char from every string in array
 const removeFirstCharFromArray = (arr) => {
@@ -202,7 +203,9 @@ const pfSubmitFunction = async (g, graphics, renderer, tempPageReRun) => {
   // now processes the current selection
   const pfQuery = document.getElementById("p_PlasmidFinder").innerHTML
 
-  let selectedPf = pfQuery.replace("PlasmidFinder:", "").split(",").filter(Boolean)
+  let selectedPf = pfQuery.replace("PlasmidFinder:", "")
+    .split(",")
+    .filter(Boolean)
 
   selectedPf = removeFirstCharFromArray(selectedPf)
 

diff --git a/patlas/db_manager/db_app/static/js/node_handling/advancedFilters.js b/patlas/db_manager/db_app/static/js/node_handling/advancedFilters.js
@@ -1,4 +1,5 @@
-/*globals speciesRequest, taxaRequest, resRequest, pfRequest, virRequest, listGiFilter, colorNodes, selectedFilter */
+/*globals speciesRequest, taxaRequest, resRequest, pfRequest, virRequest,
+listGiFilter, colorNodes, selectedFilter, blockFilterModal*/
 
 /**
  * Function to calculate intersection between arrays. Note that this function