biocore · josenavas · Mar 31, 2017 · Mar 29, 2017 · Mar 29, 2017 · Mar 29, 2017
diff --git a/knimin/handlers/ag_pulldown.py b/knimin/handlers/ag_pulldown.py
@@ -1,5 +1,7 @@
 from tornado.web import authenticated
 from future.utils import viewitems
+from StringIO import StringIO
+import pandas as pd
 
 from knimin.handlers.base import BaseHandler
 from knimin import db
@@ -13,17 +15,20 @@ class AGPulldownHandler(BaseHandler):
     def get(self):
         surveys = db.list_external_surveys()
         self.render("ag_pulldown.html", currentuser=self.current_user,
-                    barcodes=[], surveys=surveys, errors='')
+                    barcodes=[], surveys=surveys, errors='',
+                    agsurveys=db.list_ag_surveys(), merged='False')
 
     @authenticated
     def post(self):
         # Do nothing if no file given
         if 'barcodes' not in self.request.files:
             surveys = db.list_external_surveys()
+            ags = db.list_ag_surveys(map(int, self.get_arguments('agsurveys')))
             self.render("ag_pulldown.html", currentuser=self.current_user,
                         barcodes='', blanks='', external='', surveys=surveys,
                         errors="No barcode file given, thus nothing could "
-                               "be pulled down.")
+                               "be pulled down.", agsurveys=ags,
+                        merged=self.get_argument('merged', default='False'))
             return
         # Get file information, ignoring commented out lines
         fileinfo = self.request.files['barcodes'][0]['body']
@@ -41,9 +46,12 @@ def post(self):
         else:
             external = ''
         surveys = db.list_external_surveys()
+        ags = db.list_ag_surveys(map(int, self.get_arguments('agsurveys')))
         self.render("ag_pulldown.html", currentuser=self.current_user,
                     barcodes=",".join(barcodes), blanks=",".join(blanks),
-                    surveys=surveys, external=external, errors='')
+                    surveys=surveys, external=external, errors='',
+                    agsurveys=ags,
+                    merged=self.get_argument('merged', default='False'))
 
 
 @set_access(['Metadata Pulldown'])
@@ -55,10 +63,19 @@ def post(self):
             blanks = self.get_argument('blanks').split(',')
         else:
             blanks = []
-        if self.get_argument('external'):
+
+        # query which surveys have been selected by the user
+        if self.get_argument('selected_ag_surveys', []):
+            selected_ag_surveys = map(int, self.get_argument(
+                'selected_ag_surveys').split(','))
+        else:
+            selected_ag_surveys = []
+
+        if self.get_argument('external', []):
             external = self.get_argument('external').split(',')
         else:
             external = []
+
         # Get metadata and create zip file
         metadata, failures = db.pulldown(barcodes, blanks, external)
 
@@ -67,8 +84,43 @@ def post(self):
         failtext = ("The following barcodes were not retrieved "
                     "for any survey:\n%s" % failed)
         meta_zip.append("failures.txt", failtext)
+
+        # check database about what surveys are available
+        available_agsurveys = {}
+        for (id, name, selected) in db.list_ag_surveys():
+            available_agsurveys[id] = name.replace(' ', '_')
+
+        results_as_pd = []
         for survey, meta in viewitems(metadata):
-            meta_zip.append('survey_%s_md.txt' % survey, meta)
+            # only create files for those surveys that have been selected by
+            # the user. Note that ids from the DB are negative, in metadata
+            # they are positive!
+            # Currently, I (Stefan Janssen) don't have test data for external
+            # surveys, thus I don't know their 'survey' value. I expect it to
+            # be the name of the external survey. In order to not block their
+            # pulldown I check that a skipped survey ID must be in the set of
+            # all available surveys.
+            abs_survey = abs(survey)
+            if (abs_survey in selected_ag_surveys) or \
+               (abs_survey not in available_agsurveys):
+                meta_zip.append('survey_%s_md.txt' %
+                                available_agsurveys[-1 * survey], meta)
+                # transform each survey into a pandas dataframe for later merge
+                # read all columns as string to avoid unintended conversions,
+                # like cutting leading zeros of barcodes
+                pd_meta = pd.read_csv(StringIO(meta), sep="\t", dtype=str)
+                # reset the index to barcodes = here sample_name
+                pd_meta.set_index('sample_name', inplace=True)
+                results_as_pd.append(pd_meta)
+
+        # add the merged table of all selected surveys to the zip archive
+        if self.get_argument('merged', default='False') == 'True':
+            pd_all = pd.DataFrame()
+            if len(results_as_pd) > 0:
+                pd_all = pd.concat(results_as_pd, join='outer', axis=1)
+                meta_zip.append('surveys_merged_md.txt',
+                                pd_all.to_csv(sep='\t',
+                                              index_label='sample_name'))
 
         # write out zip file
         self.add_header('Content-type',  'application/octet-stream')

diff --git a/knimin/lib/data_access.py b/knimin/lib/data_access.py
@@ -1685,6 +1685,30 @@ def add_external_survey(self, survey, description, url):
                  RETURNING external_survey_id"""
         return self._con.execute_fetchone(sql, [survey, description, url])[0]
 
+    def list_ag_surveys(self, selected=None):
+        """Returns the list of american gut survey names.
+
+        Parameters
+        ----------
+        selected : iterable of int, optional
+            group_order IDs of the surveys to flag as "chosen"; may be a
+            one-shot iterator. If None, every survey is flagged.
+
+        Returns
+        -------
+        list of (int, str, bool)
+            first element is the group_order number
+            second element is the group name
+            third element is True if the survey counts as "chosen"
+        """
+        # materialize once: O(1) lookups and safe for one-shot iterators
+        chosen = None if selected is None else set(selected)
+        sql = """SELECT group_order, american
+                FROM ag.survey_group
+                WHERE group_order < 0"""
+        return [(id_, name, chosen is None or id_ in chosen)
+                for id_, name in self._con.execute_fetchall(sql)]
+
     def list_external_surveys(self):
         """Returns list of external survey names
 

diff --git a/knimin/lib/mem_zip.py b/knimin/lib/mem_zip.py
@@ -61,6 +61,45 @@ def write_to_buffer(self):
         return self.in_memory_data.getvalue()
 
 
+def extract_zip(input_zip):
+    """Reads all files of a zip archive into memory.
+
+    Every member of the archive is read completely; the archive handle is
+    closed before returning, also if reading a member fails.
+
+    Parameters
+    ----------
+    input_zip : str or file-like object
+        The filename of the archive, or an open binary file object.
+
+    Returns
+    -------
+    dict of str: keys = filenames in archive, values = content of files
+    """
+    # the context manager guarantees that the file handle is released
+    with zipfile.ZipFile(input_zip) as archive:
+        return {name: archive.read(name) for name in archive.namelist()}
+
+
+def sneak_files(archive, len=1000):
+    """Returns the first characters of each file in a zip archive.
+
+    Parameters
+    ----------
+    archive : dict{str : filename, str : filecontents}
+        The already extracted zip archive in form of a dict, where keys are
+        filenames and values are the content of the file.
+    len : int
+        Number of characters returned from each file. Default: 1000.
+        (The name shadows the builtin but is kept for keyword callers.)
+
+    Returns
+    -------
+    list of dict{str: str}: one single-entry dict per file, mapping the
+    filename to the first <len> characters of its content."""
+    return [{name: content[:len]} for name, content in archive.items()]
+
+
 if __name__ == "__main__":
     # Run a test
     imz = InMemoryZip()

diff --git a/knimin/lib/tests/test_data_access.py b/knimin/lib/tests/test_data_access.py
@@ -272,55 +272,54 @@ def test_get_ag_barcode_details(self):
         obs = db.get_ag_barcode_details(['000018046'])
         ag_login_id = '0060a301-e5bf-6a4e-e050-8a800c5d49b7'
         exp = {'000018046': {
-                'ag_kit_barcode_id': '0060a301-e5c1-6a4e-e050-8a800c5d49b7',
-                'verification_email_sent': 'n',
-                'pass_reset_code': None,
-                'vioscreen_status': 3,
-                'sample_barcode_file': '000018046.jpg',
-                'environment_sampled': None,
-                'supplied_kit_id': db.ut_get_supplied_kit_id(ag_login_id),
-                'withdrawn': None,
-                'kit_verified': 'y',
-                # 'city': 'REMOVED',
-                'ag_kit_id': '0060a301-e5c0-6a4e-e050-8a800c5d49b7',
-                # 'zip': 'REMOVED',
-                'ag_login_id': ag_login_id,
-                # 'state': 'REMOVED',
-                'results_ready': 'Y',
-                'moldy': 'N',
-                # The key 'registered_on' is a time stamp when the database is
-                # created. It is unique per deployment.
-                # 'registered_on': datetime.datetime(2016, 8, 17, 10, 47, 2,
-                #                                   713292),
-                # 'kit_password': ('$2a$10$2.6Y9HmBqUFmSvKCjWmBte70WF.zd3h4Vqb'
-                #                  'hLMQK1xP67Aj3rei86'),
-                # 'deposited': False,
-                'sample_date': datetime.date(2014, 8, 13),
-                # 'email': 'REMOVED',
-                'print_results': False,
-                'open_humans_token': None,
-                # 'elevation': 0.0,
-                'refunded': None,
-                # 'other_text': 'REMOVED',
-                'barcode': '000018046',
-                'swabs_per_kit': 1L,
-                # 'kit_verification_code': '60260',
-                # 'latitude': 0.0,
-                'cannot_geocode': None,
-                # 'address': 'REMOVED',
-                'date_of_last_email': datetime.date(2014, 8, 15),
-                'site_sampled': 'Stool',
-                # 'name': 'REMOVED',
-                'sample_time': datetime.time(11, 15),
-                # 'notes': 'REMOVED',
-                'overloaded': 'N',
-                # 'longitude': 0.0,
-                'pass_reset_time': None,
-                # 'country': 'REMOVED',
-                'survey_id': '084532330aca5885',
-                'other': 'N',
-                'sample_barcode_file_md5': None
-        }}
+               'ag_kit_barcode_id': '0060a301-e5c1-6a4e-e050-8a800c5d49b7',
+               'verification_email_sent': 'n',
+               'pass_reset_code': None,
+               'vioscreen_status': 3,
+               'sample_barcode_file': '000018046.jpg',
+               'environment_sampled': None,
+               'supplied_kit_id': db.ut_get_supplied_kit_id(ag_login_id),
+               'withdrawn': None,
+               'kit_verified': 'y',
+               # 'city': 'REMOVED',
+               'ag_kit_id': '0060a301-e5c0-6a4e-e050-8a800c5d49b7',
+               # 'zip': 'REMOVED',
+               'ag_login_id': ag_login_id,
+               # 'state': 'REMOVED',
+               'results_ready': 'Y',
+               'moldy': 'N',
+               # The key 'registered_on' is a time stamp when the database is
+               # created. It is unique per deployment.
+               # 'registered_on': datetime.datetime(2016, 8, 17, 10, 47, 2,
+               #                                   713292),
+               # 'kit_password': ('$2a$10$2.6Y9HmBqUFmSvKCjWmBte70WF.zd3h4Vqb'
+               #                  'hLMQK1xP67Aj3rei86'),
+               # 'deposited': False,
+               'sample_date': datetime.date(2014, 8, 13),
+               # 'email': 'REMOVED',
+               'print_results': False,
+               'open_humans_token': None,
+               # 'elevation': 0.0,
+               'refunded': None,
+               # 'other_text': 'REMOVED',
+               'barcode': '000018046',
+               'swabs_per_kit': 1L,
+               # 'kit_verification_code': '60260',
+               # 'latitude': 0.0,
+               'cannot_geocode': None,
+               # 'address': 'REMOVED',
+               'date_of_last_email': datetime.date(2014, 8, 15),
+               'site_sampled': 'Stool',
+               # 'name': 'REMOVED',
+               'sample_time': datetime.time(11, 15),
+               # 'notes': 'REMOVED',
+               'overloaded': 'N',
+               # 'longitude': 0.0,
+               'pass_reset_time': None,
+               # 'country': 'REMOVED',
+               'survey_id': '084532330aca5885',
+               'other': 'N',
+               'sample_barcode_file_md5': None}}
         participant_names = db.ut_get_participant_names_from_ag_login_id(
             ag_login_id)
         for key in obs:
@@ -329,6 +328,23 @@ def test_get_ag_barcode_details(self):
             self.assertEqual({k: obs[key][k] for k in exp[key]}, exp[key])
             self.assertIn(obs[key]['participant_name'], participant_names)
 
+    def test_list_ag_surveys(self):
+        """The third tuple element reflects the `selected` argument."""
+        # without an argument, every survey is flagged as chosen
+        truth = [(-1, 'Personal Information', True),
+                 (-2, 'Pet Information', True),
+                 (-3, 'Fermented Foods', True),
+                 (-4, 'Surfers', True),
+                 (-5, 'Personal_Microbiome', True)]
+        # query once; the result does not change between assertions
+        obs = db.list_ag_surveys()
+        for survey in truth:
+            self.assertIn(survey, obs)
+
+        # with an explicit selection, only the listed IDs are flagged
+        truth = [(-1, 'Personal Information', False),
+                 (-2, 'Pet Information', True),
+                 (-3, 'Fermented Foods', False),
+                 (-4, 'Surfers', True),
+                 (-5, 'Personal_Microbiome', False)]
+        obs = db.list_ag_surveys([-2, -4])
+        for survey in truth:
+            self.assertIn(survey, obs)
+
 
 if __name__ == "__main__":
     main()
diff --git a/knimin/lib/tests/test_mem_zip.py b/knimin/lib/tests/test_mem_zip.py
@@ -1,8 +1,9 @@
 import unittest
-from knimin.lib.mem_zip import InMemoryZip
+from knimin.lib.mem_zip import InMemoryZip, extract_zip, sneak_files
 import zipfile
 import os
 import io
+from os.path import join, dirname, realpath
 
 
 class TestInMemoryZip(unittest.TestCase):
@@ -50,6 +51,33 @@ def test_write_to_buffer(self):
         res_contents = zhandle.read(self.test_fname)
         self.assertEqual(res_contents, exp_contents)
 
+    def test_extract_zip(self):
+        """extract_zip returns every member of the example archive."""
+        data_dir = join(dirname(realpath(__file__)), '..', '..', 'tests',
+                        'data')
+        contents = extract_zip(
+            join(data_dir, 'results_multiplesurvey_barcodes.zip'))
+        # all file names stored in the example archive
+        expected = sorted(['failures.txt', 'survey_Fermented_Foods_md.txt',
+                           'survey_Personal_Information_md.txt',
+                           'survey_Personal_Microbiome_md.txt',
+                           'survey_Pet_Information_md.txt',
+                           'surveys_merged_md.txt', 'survey_Surfers_md.txt'])
+        self.assertEqual(sorted(contents.keys()), expected)
+        # spot check of the content of a single file
+        self.assertIn('SURF_BOARD_TYPE', contents['survey_Surfers_md.txt'])
+
+    def test_sneak_files(self):
+        """sneak_files truncates the content of every archive member."""
+        fp_zip = join(dirname(realpath(__file__)), '..', '..', 'tests', 'data',
+                      'results_multiplesurvey_barcodes.zip')
+        exp = [{'survey_Personal_Microbiome_md.txt': 'sample_name\tPM_AGE\tP'},
+               {'survey_Pet_Information_md.txt': 'sample_name\tALTITUDE'},
+               {'failures.txt': 'The following barcod'},
+               {'survey_Fermented_Foods_md.txt': 'sample_name\tFERMENTE'},
+               {'survey_Surfers_md.txt': 'sample_name\tSURF_BOA'},
+               {'survey_Personal_Information_md.txt': 'sample_name\tACNE_MED'},
+               {'surveys_merged_md.txt': 'sample_name\tACNE_MED'}]
+        obs = sneak_files(extract_zip(fp_zip), 20)
+        # the order of the entries follows dict iteration order, which is
+        # not guaranteed; compare independent of the order (each entry is a
+        # single-key dict, so its sorted key list is a valid sort key)
+        self.assertEqual(sorted(exp, key=sorted), sorted(obs, key=sorted))
+
 
 if __name__ == '__main__':
     unittest.main()