Skip to content
This repository has been archived by the owner on Jul 29, 2020. It is now read-only.

Pulldown: select surveys to be exported + merged table #163

Merged
merged 28 commits into from
Mar 31, 2017
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
41068be
new test data
sjanssen2 Mar 29, 2017
5b0b4d4
new function to extract a ZIP archive + unit tests
sjanssen2 Mar 29, 2017
8e7a61b
new function to return the list of internal surveys + unit tests
sjanssen2 Mar 29, 2017
de6ade8
updated test data to new scrubbed DB version
sjanssen2 Mar 29, 2017
55bb5c1
adapted test data to new scrubbed DB
sjanssen2 Mar 29, 2017
58e61d0
work in progress
sjanssen2 Mar 29, 2017
e667921
Merge branch 'master' of https://github.com/biocore/labadmin into fix…
sjanssen2 Mar 29, 2017
36b88a6
updated test data
sjanssen2 Mar 29, 2017
43f32d4
made parameter optional + check empty result list
sjanssen2 Mar 29, 2017
1bf5fc0
adapted to scrubbed DB
sjanssen2 Mar 29, 2017
a99666a
updated groud truth
sjanssen2 Mar 29, 2017
f980040
update in test
sjanssen2 Mar 29, 2017
ba14da3
a new function that only returns the first X char of each file in an …
sjanssen2 Mar 29, 2017
afabe84
comparing only first 1000 chars of each file
sjanssen2 Mar 29, 2017
a4c6729
docstr
sjanssen2 Mar 29, 2017
c7f43ce
addressing Jose's comments
sjanssen2 Mar 30, 2017
64cbb39
clarification in docstring + avoid using keyword id
sjanssen2 Mar 30, 2017
6401a93
added a forgotten parameter at docstring
sjanssen2 Mar 30, 2017
903212c
fixed a typo
sjanssen2 Mar 30, 2017
7b1d57d
Merge branch 'master' of https://github.com/biocore/labadmin into add…
sjanssen2 Mar 30, 2017
0d81e3a
corrected passing arguments to handler, i.e. by using lists
sjanssen2 Mar 30, 2017
f9e3f66
getargument_s completely avoids split(',') issues
sjanssen2 Mar 30, 2017
ff7aee5
new ground truth
sjanssen2 Mar 30, 2017
ccd0eac
shit: get_parameters would be nice, but the HTML page provides a stri…
sjanssen2 Mar 30, 2017
8051f70
using Jose's style
sjanssen2 Mar 30, 2017
003fed4
also testing HTML parameter passing
sjanssen2 Mar 30, 2017
7d49672
try to reduce complexity
sjanssen2 Mar 30, 2017
dc55575
rename var list -> _list + unit tests
sjanssen2 Mar 31, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 75 additions & 13 deletions knimin/handlers/ag_pulldown.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from tornado.web import authenticated
from future.utils import viewitems
from StringIO import StringIO
import pandas as pd

from knimin.handlers.base import BaseHandler
from knimin import db
Expand All @@ -13,17 +15,20 @@ class AGPulldownHandler(BaseHandler):
def get(self):
surveys = db.list_external_surveys()
self.render("ag_pulldown.html", currentuser=self.current_user,
barcodes=[], surveys=surveys, errors='')
barcodes=[], surveys=surveys, errors='',
agsurveys=db.list_ag_surveys(), merged='False')

@authenticated
def post(self):
# Do nothing if no file given
if 'barcodes' not in self.request.files:
surveys = db.list_external_surveys()
ags = db.list_ag_surveys(map(int, self.get_arguments('agsurveys')))
self.render("ag_pulldown.html", currentuser=self.current_user,
barcodes='', blanks='', external='', surveys=surveys,
errors="No barcode file given, thus nothing could "
"be pulled down.")
"be pulled down.", agsurveys=ags,
merged=self.get_argument('merged', default='False'))
return
# Get file information, ignoring commented out lines
fileinfo = self.request.files['barcodes'][0]['body']
Expand All @@ -41,24 +46,27 @@ def post(self):
else:
external = ''
surveys = db.list_external_surveys()
ags = db.list_ag_surveys(map(int, self.get_arguments('agsurveys')))
self.render("ag_pulldown.html", currentuser=self.current_user,
barcodes=",".join(barcodes), blanks=",".join(blanks),
surveys=surveys, external=external, errors='')
surveys=surveys, external=external, errors='',
agsurveys=ags,
merged=self.get_argument('merged', default='False'))


@set_access(['Metadata Pulldown'])
class AGPulldownDLHandler(BaseHandler):
@authenticated
def post(self):
barcodes = self.get_argument('barcodes').split(',')
if self.get_argument('blanks'):
blanks = self.get_argument('blanks').split(',')
else:
blanks = []
if self.get_argument('external'):
external = self.get_argument('external').split(',')
else:
external = []
barcodes = listify(self.get_arguments('barcodes'))
blanks = listify(self.get_arguments('blanks'))
# query which surveys have been selected by the user
selected_ag_surveys = listify(
self.get_arguments('selected_ag_surveys'))
external = listify(self.get_arguments('external'))

selected_ag_surveys = list(map(int, selected_ag_surveys))

# Get metadata and create zip file
metadata, failures = db.pulldown(barcodes, blanks, external)

Expand All @@ -67,8 +75,43 @@ def post(self):
failtext = ("The following barcodes were not retrieved "
"for any survey:\n%s" % failed)
meta_zip.append("failures.txt", failtext)

# check database about what surveys are available
available_agsurveys = {}
for (_id, name, _) in db.list_ag_surveys():
available_agsurveys[_id] = name.replace(' ', '_')

results_as_pd = []
for survey, meta in viewitems(metadata):
meta_zip.append('survey_%s_md.txt' % survey, meta)
# only create files for those surveys that have been selected by
# the user. Note that ids from the DB are negative, in metadata
# they are positive!
# Currently, I (Stefan Janssen) don't have test data for external
# surveys, thus I don't know their 'survey' value. I expect it to
# be the name of the external survey. In order to not block their
# pulldown I check that a skipped survey ID must be in the set of
# all available surveys.
survey = -1 * survey
if (survey in selected_ag_surveys) or \
(survey not in available_agsurveys):
meta_zip.append('survey_%s_md.txt' %
available_agsurveys[survey], meta)
# transform each survey into a pandas dataframe for later merge
# read all columns as string to avoid unintended conversions,
# like cutting leading zeros of barcodes
pd_meta = pd.read_csv(StringIO(meta), sep="\t", dtype=str)
# reset the index to barcodes = here sample_name
pd_meta.set_index('sample_name', inplace=True)
results_as_pd.append(pd_meta)

# add the merged table of all selected surveys to the zip archive
if self.get_argument('merged', default='False') == 'True':
pd_all = pd.DataFrame()
if len(results_as_pd) > 0:
pd_all = pd.concat(results_as_pd, join='outer', axis=1)
meta_zip.append('surveys_merged_md.txt',
pd_all.to_csv(sep='\t',
index_label='sample_name'))

# write out zip file
self.add_header('Content-type', 'application/octet-stream')
Expand All @@ -92,3 +135,22 @@ def get(self):
except Exception as e:
msg = 'ERROR: %s' % str(e)
self.write(msg)


def listify(list):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`list` is a Python built-in; use a different variable name (e.g. `l` is OK given that the function is simple).
This function is also missing a unit test.

I recommend using a text editor with Python syntax highlighting so that built-in names and reserved words are easy to identify.

""" Returns a flat list of str for either list of str (unchanged) or a one-
element list of str - delimited by ',' - into a list of str.

Parameters
----------
list : [str]
Input list of str

Returns
-------
A list of str. If single element was a comma delimited str with x parts,
the list will contain those x elements."""
if len(list) == 1:
if ',' in list[0]:
list = list[0].split(',')
return list
24 changes: 24 additions & 0 deletions knimin/lib/data_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -1685,6 +1685,30 @@ def add_external_survey(self, survey, description, url):
RETURNING external_survey_id"""
return self._con.execute_fetchone(sql, [survey, description, url])[0]

def list_ag_surveys(self, selected=None):
"""Returns the list of american gut survey names.

Parameters
----------
selected : list of int
The returned list's third element indicates if a survey has been
"chosen". If selected is None, all surveys will be "chosen",
otherwise only surveys whose ID is in selected are "chosen".

Returns
-------
list of (int, str, bool)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is the last boolean needed? It is also confusing that the selected attribute is only used to mark this boolean - shouldn't it limit which ones are returned, rather than the user then going back over the list to filter the ones that he is not interested in? And by user I mean the developer that consumes this function.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is used to render all survey names on the metadata pulldown website. The user (who is browsing labadmin) has the option to tick a survey to include or exclude it for the actual pulldown.
Thus, this function is a hybrid of DB query and interface rendering, but since it is so simple I thought it would be OK.

first element is the group_order number
second element is the group name
third element is the information if user selected this survey for
pulldown. All surveys are selected if selected is None.
"""
sql = """SELECT group_order, american
FROM ag.survey_group
WHERE group_order < 0"""
return [(id_, name, (selected is None) or (id_ in selected))
for [id_, name] in self._con.execute_fetchall(sql)]

def list_external_surveys(self):
"""Returns list of external survey names

Expand Down
39 changes: 39 additions & 0 deletions knimin/lib/mem_zip.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,45 @@ def write_to_buffer(self):
return self.in_memory_data.getvalue()


def extract_zip(input_zip):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this function blow up the memory usage?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For sure, if you extract a huge archive. But I need to do the extraction for unit testing, and I thought it was more elegant than making system calls to 'unzip'.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok!

""" Reads all files of a zip file from disk.

A helper function to read in all files of a zip archive as strings and
return a dict of those strings where the keys are the filenames.

Parameters
----------
input_zip : str
The filename of the archive.

Returns
-------
A dict of str: keys = filenames in archive, values = content of files
"""

input_zip = zipfile.ZipFile(input_zip)
return {name: input_zip.read(name) for name in input_zip.namelist()}


def sneak_files(archive, len=1000):
""" Returns the first characters of each file in an zip archive.

Parameters
----------
archive : dict{str : filename, str : filecontents}
The already extracted zip archive in form of a dict, where keys are
filenames and values are the content of the file.
len : int
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The `archive` parameter is missing from the docstring.

Number of characters returned from each file.
Default: 1000.

Returns
-------
dict{str, str} where the first component is the filename and the second
the first <len> characters of the file."""
return map(lambda (k, v): {k: v[:len]}, archive.items())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just FYI, if the string is shorter than `len`, it will return the full string ... this can cause problems in certain cases.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be fine in this case, but thanks for the info

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I now know :-)



if __name__ == "__main__":
# Run a test
imz = InMemoryZip()
Expand Down
112 changes: 63 additions & 49 deletions knimin/lib/tests/test_data_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,55 +272,54 @@ def test_get_ag_barcode_details(self):
obs = db.get_ag_barcode_details(['000018046'])
ag_login_id = '0060a301-e5bf-6a4e-e050-8a800c5d49b7'
exp = {'000018046': {
'ag_kit_barcode_id': '0060a301-e5c1-6a4e-e050-8a800c5d49b7',
'verification_email_sent': 'n',
'pass_reset_code': None,
'vioscreen_status': 3,
'sample_barcode_file': '000018046.jpg',
'environment_sampled': None,
'supplied_kit_id': db.ut_get_supplied_kit_id(ag_login_id),
'withdrawn': None,
'kit_verified': 'y',
# 'city': 'REMOVED',
'ag_kit_id': '0060a301-e5c0-6a4e-e050-8a800c5d49b7',
# 'zip': 'REMOVED',
'ag_login_id': ag_login_id,
# 'state': 'REMOVED',
'results_ready': 'Y',
'moldy': 'N',
# The key 'registered_on' is a time stamp when the database is
# created. It is unique per deployment.
# 'registered_on': datetime.datetime(2016, 8, 17, 10, 47, 2,
# 713292),
# 'kit_password': ('$2a$10$2.6Y9HmBqUFmSvKCjWmBte70WF.zd3h4Vqb'
# 'hLMQK1xP67Aj3rei86'),
# 'deposited': False,
'sample_date': datetime.date(2014, 8, 13),
# 'email': 'REMOVED',
'print_results': False,
'open_humans_token': None,
# 'elevation': 0.0,
'refunded': None,
# 'other_text': 'REMOVED',
'barcode': '000018046',
'swabs_per_kit': 1L,
# 'kit_verification_code': '60260',
# 'latitude': 0.0,
'cannot_geocode': None,
# 'address': 'REMOVED',
'date_of_last_email': datetime.date(2014, 8, 15),
'site_sampled': 'Stool',
# 'name': 'REMOVED',
'sample_time': datetime.time(11, 15),
# 'notes': 'REMOVED',
'overloaded': 'N',
# 'longitude': 0.0,
'pass_reset_time': None,
# 'country': 'REMOVED',
'survey_id': '084532330aca5885',
'other': 'N',
'sample_barcode_file_md5': None
}}
'ag_kit_barcode_id': '0060a301-e5c1-6a4e-e050-8a800c5d49b7',
'verification_email_sent': 'n',
'pass_reset_code': None,
'vioscreen_status': 3,
'sample_barcode_file': '000018046.jpg',
'environment_sampled': None,
'supplied_kit_id': db.ut_get_supplied_kit_id(ag_login_id),
'withdrawn': None,
'kit_verified': 'y',
# 'city': 'REMOVED',
'ag_kit_id': '0060a301-e5c0-6a4e-e050-8a800c5d49b7',
# 'zip': 'REMOVED',
'ag_login_id': ag_login_id,
# 'state': 'REMOVED',
'results_ready': 'Y',
'moldy': 'N',
# The key 'registered_on' is a time stamp when the database is
# created. It is unique per deployment.
# 'registered_on': datetime.datetime(2016, 8, 17, 10, 47, 2,
# 713292),
# 'kit_password': ('$2a$10$2.6Y9HmBqUFmSvKCjWmBte70WF.zd3h4Vqb'
# 'hLMQK1xP67Aj3rei86'),
# 'deposited': False,
'sample_date': datetime.date(2014, 8, 13),
# 'email': 'REMOVED',
'print_results': False,
'open_humans_token': None,
# 'elevation': 0.0,
'refunded': None,
# 'other_text': 'REMOVED',
'barcode': '000018046',
'swabs_per_kit': 1L,
# 'kit_verification_code': '60260',
# 'latitude': 0.0,
'cannot_geocode': None,
# 'address': 'REMOVED',
'date_of_last_email': datetime.date(2014, 8, 15),
'site_sampled': 'Stool',
# 'name': 'REMOVED',
'sample_time': datetime.time(11, 15),
# 'notes': 'REMOVED',
'overloaded': 'N',
# 'longitude': 0.0,
'pass_reset_time': None,
# 'country': 'REMOVED',
'survey_id': '084532330aca5885',
'other': 'N',
'sample_barcode_file_md5': None}}
participant_names = db.ut_get_participant_names_from_ag_login_id(
ag_login_id)
for key in obs:
Expand All @@ -329,6 +328,21 @@ def test_get_ag_barcode_details(self):
self.assertEqual({k: obs[key][k] for k in exp[key]}, exp[key])
self.assertIn(obs[key]['participant_name'], participant_names)

def test_list_ag_surveys(self):
truth = [(-1, 'Personal Information', True),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this test can be simplified as

self.assertItemsEqual(db.list_ag_surveys(), truth)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope. Consider that we may add surveys in the future. This would break your test.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test would not be correct, since it will not be testing that all surveys are returned.

(-2, 'Pet Information', True),
(-3, 'Fermented Foods', True),
(-4, 'Surfers', True),
(-5, 'Personal_Microbiome', True)]
self.assertItemsEqual(db.list_ag_surveys(), truth)

truth = [(-1, 'Personal Information', False),
(-2, 'Pet Information', True),
(-3, 'Fermented Foods', False),
(-4, 'Surfers', True),
(-5, 'Personal_Microbiome', False)]
self.assertItemsEqual(db.list_ag_surveys([-2, -4]), truth)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to clarify - is this testing against the real database whether these surveys are in the db?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's testing that this method can obtain survey names AND that the third component is set to True for all surveys if None is provided, and otherwise only for those surveys that are in the list passed to the function as the `selected` argument.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do think that this is a really critical check. Now it looks like every time that a survey gets added, this test would need to be updated. What do you think about this? Do you think that it is worthwhile to raise an issue to rework these tests, so that we don't need to update them every time that the surveys are updated?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that adding new surveys is a frequently occurring task. Thus, updating the test each time should be fine.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok that's fine - but what do you think about having some sort of documentation that contains all of the tests that need to be updated? If there is an additional survey being added - it could be a pain in the butt to hunt and find which tests need to be updated.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On second thought, I totally agree with your argument that this is not future-safe. Thus, I changed the code in such a way that additional surveys should not influence the outcome.


if __name__ == "__main__":
main()
30 changes: 29 additions & 1 deletion knimin/lib/tests/test_mem_zip.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import unittest
from knimin.lib.mem_zip import InMemoryZip
from knimin.lib.mem_zip import InMemoryZip, extract_zip, sneak_files
import zipfile
import os
import io
from os.path import join, dirname, realpath


class TestInMemoryZip(unittest.TestCase):
Expand Down Expand Up @@ -50,6 +51,33 @@ def test_write_to_buffer(self):
res_contents = zhandle.read(self.test_fname)
self.assertEqual(res_contents, exp_contents)

def test_extract_zip(self):
    """Extract the multi-survey fixture archive and check its members.

    Verifies that extract_zip returns exactly the expected file names and
    spot-checks the content of one of the extracted files.
    """
    data_dir = join(dirname(realpath(__file__)), '..', '..', 'tests',
                    'data')
    extracted = extract_zip(
        join(data_dir, 'results_multiplesurvey_barcodes.zip'))

    expected_names = [
        'failures.txt',
        'survey_Fermented_Foods_md.txt',
        'survey_Personal_Information_md.txt',
        'survey_Personal_Microbiome_md.txt',
        'survey_Pet_Information_md.txt',
        'surveys_merged_md.txt',
        'survey_Surfers_md.txt',
    ]

    # the archive must contain exactly these files (order is irrelevant)
    self.assertEqual(sorted(extracted.keys()), sorted(expected_names))

    # only a brief look at the contents: one known column header
    surfers_table = extracted['survey_Surfers_md.txt']
    self.assertIn('SURF_BOARD_TYPE', surfers_table)

def test_sneak_files(self):
    """Check sneak_files on the multi-survey fixture archive.

    Each file of the extracted archive is expected to be reported as a
    one-entry dict holding the file name and its first 20 characters.
    """
    here = dirname(realpath(__file__))
    zip_path = join(here, '..', '..', 'tests', 'data',
                    'results_multiplesurvey_barcodes.zip')

    # (file name, expected leading 20 characters), in the expected order
    name_and_head = [
        ('survey_Personal_Microbiome_md.txt', 'sample_name\tPM_AGE\tP'),
        ('survey_Pet_Information_md.txt', 'sample_name\tALTITUDE'),
        ('failures.txt', 'The following barcod'),
        ('survey_Fermented_Foods_md.txt', 'sample_name\tFERMENTE'),
        ('survey_Surfers_md.txt', 'sample_name\tSURF_BOA'),
        ('survey_Personal_Information_md.txt', 'sample_name\tACNE_MED'),
        ('surveys_merged_md.txt', 'sample_name\tACNE_MED'),
    ]
    expected = [{name: head} for name, head in name_and_head]

    extracted = extract_zip(zip_path)
    observed = sneak_files(extracted, 20)
    self.assertEqual(expected, observed)


if __name__ == '__main__':
unittest.main()
Loading