-
Notifications
You must be signed in to change notification settings - Fork 16
Pulldown: select surveys to be exported + merged table #163
Changes from 20 commits
41068be
5b0b4d4
8e7a61b
de6ade8
55bb5c1
58e61d0
e667921
36b88a6
43f32d4
1bf5fc0
a99666a
f980040
ba14da3
afabe84
a4c6729
c7f43ce
64cbb39
6401a93
903212c
7b1d57d
0d81e3a
f9e3f66
ff7aee5
ccd0eac
8051f70
003fed4
7d49672
dc55575
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
from tornado.web import authenticated | ||
from future.utils import viewitems | ||
from StringIO import StringIO | ||
import pandas as pd | ||
|
||
from knimin.handlers.base import BaseHandler | ||
from knimin import db | ||
|
@@ -13,17 +15,20 @@ class AGPulldownHandler(BaseHandler): | |
def get(self): | ||
surveys = db.list_external_surveys() | ||
self.render("ag_pulldown.html", currentuser=self.current_user, | ||
barcodes=[], surveys=surveys, errors='') | ||
barcodes=[], surveys=surveys, errors='', | ||
agsurveys=db.list_ag_surveys(), merged='False') | ||
|
||
@authenticated | ||
def post(self): | ||
# Do nothing if no file given | ||
if 'barcodes' not in self.request.files: | ||
surveys = db.list_external_surveys() | ||
ags = db.list_ag_surveys(map(int, self.get_arguments('agsurveys'))) | ||
self.render("ag_pulldown.html", currentuser=self.current_user, | ||
barcodes='', blanks='', external='', surveys=surveys, | ||
errors="No barcode file given, thus nothing could " | ||
"be pulled down.") | ||
"be pulled down.", agsurveys=ags, | ||
merged=self.get_argument('merged', default='False')) | ||
return | ||
# Get file information, ignoring commented out lines | ||
fileinfo = self.request.files['barcodes'][0]['body'] | ||
|
@@ -41,9 +46,12 @@ def post(self): | |
else: | ||
external = '' | ||
surveys = db.list_external_surveys() | ||
ags = db.list_ag_surveys(map(int, self.get_arguments('agsurveys'))) | ||
self.render("ag_pulldown.html", currentuser=self.current_user, | ||
barcodes=",".join(barcodes), blanks=",".join(blanks), | ||
surveys=surveys, external=external, errors='') | ||
surveys=surveys, external=external, errors='', | ||
agsurveys=ags, | ||
merged=self.get_argument('merged', default='False')) | ||
|
||
|
||
@set_access(['Metadata Pulldown']) | ||
|
@@ -55,10 +63,19 @@ def post(self): | |
blanks = self.get_argument('blanks').split(',') | ||
else: | ||
blanks = [] | ||
if self.get_argument('external'): | ||
|
||
# query which surveys have been selected by the user | ||
if self.get_argument('selected_ag_surveys', []): | ||
selected_ag_surveys = map(int, self.get_argument( | ||
'selected_ag_surveys').split(',')) | ||
else: | ||
selected_ag_surveys = [] | ||
|
||
if self.get_argument('external', []): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same with this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see above |
||
external = self.get_argument('external').split(',') | ||
else: | ||
external = [] | ||
|
||
# Get metadata and create zip file | ||
metadata, failures = db.pulldown(barcodes, blanks, external) | ||
|
||
|
@@ -67,8 +84,43 @@ def post(self): | |
failtext = ("The following barcodes were not retrieved " | ||
"for any survey:\n%s" % failed) | ||
meta_zip.append("failures.txt", failtext) | ||
|
||
# check database about what surveys are available | ||
available_agsurveys = {} | ||
for (id, name, selected) in db.list_ag_surveys(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also catched by Antonio. Fixed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could you replace id -> _id and perhaps selected by _ as it's not being used? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
available_agsurveys[id] = name.replace(' ', '_') | ||
|
||
results_as_pd = [] | ||
for survey, meta in viewitems(metadata): | ||
meta_zip.append('survey_%s_md.txt' % survey, meta) | ||
# only create files for those surveys that have been selected by | ||
# the user. Note that ids from the DB are negative, in metadata | ||
# they are positive! | ||
# Currently, I (Stefan Janssen) don't have test data for external | ||
# surveys, thus I don't know their 'survey' value. I expect it to | ||
# be the name of the external survey. In order to not block their | ||
# pulldown I check that a skipped survey ID must be in the set of | ||
# all available surveys. | ||
abs_survey = abs(survey) | ||
if (abs_survey in selected_ag_surveys) or \ | ||
(abs_survey not in available_agsurveys): | ||
meta_zip.append('survey_%s_md.txt' % | ||
available_agsurveys[-1 * survey], meta) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. that would be wrong, because those keys are actually negative numbers; it's what the user provided (i.e. the database) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't it then be just There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think I now fixed it. We cannot change the keys in available_agsurveys which are negative, but we can flip the sign for variable survey :-) |
||
# transform each survey into a pandas dataframe for later merge | ||
# read all columns as string to avoid unintened conversions, | ||
# like cutting leading zeros of barcodes | ||
pd_meta = pd.read_csv(StringIO(meta), sep="\t", dtype=str) | ||
# reset the index to barcodes = here sample_name | ||
pd_meta.set_index('sample_name', inplace=True) | ||
results_as_pd.append(pd_meta) | ||
|
||
# add the merged table of all selected surveys to the zip archive | ||
if self.get_argument('merged', default='False') == 'True': | ||
pd_all = pd.DataFrame() | ||
if len(results_as_pd) > 0: | ||
pd_all = pd.concat(results_as_pd, join='outer', axis=1) | ||
meta_zip.append('surveys_merged_md.txt', | ||
pd_all.to_csv(sep='\t', | ||
index_label='sample_name')) | ||
|
||
# write out zip file | ||
self.add_header('Content-type', 'application/octet-stream') | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1685,6 +1685,30 @@ def add_external_survey(self, survey, description, url): | |
RETURNING external_survey_id""" | ||
return self._con.execute_fetchone(sql, [survey, description, url])[0] | ||
|
||
def list_ag_surveys(self, selected=None): | ||
"""Returns the list of american gut survey names. | ||
|
||
Parameters | ||
---------- | ||
selected : list of int | ||
The returned list's third element indicates if a survey has been | ||
"chosen". If selected is None, all surveys will be "chosen", | ||
otherwise only surveys whose ID is in selected are "chosen". | ||
|
||
Returns | ||
------- | ||
list of (int, str, bool) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is the last boolean needed? It is also confusing that the selected attribute is only used to mark this boolean - shouldn't it limit which ones are returned, rather than the user then going back over the list to filter the ones that he is not interested in? And by user I mean the developer that consumes this function. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function is used to render all survey names on the metadata pulldown website. The user (who is browsing labadmin) has the option to tick a survey to include or exclude it for the actual pulldown. |
||
first element is the group_order number | ||
second element is the group name | ||
third element is the information if user selected this survey for | ||
pulldown. All surveys are selected if selected is None. | ||
""" | ||
sql = """SELECT group_order, american | ||
FROM ag.survey_group | ||
WHERE group_order < 0""" | ||
return [(id_, name, (selected is None) or (id_ in selected)) | ||
for [id_, name] in self._con.execute_fetchall(sql)] | ||
|
||
def list_external_surveys(self): | ||
"""Returns list of external survey names | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -61,6 +61,45 @@ def write_to_buffer(self): | |
return self.in_memory_data.getvalue() | ||
|
||
|
||
def extract_zip(input_zip): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can this function blow up the memory usage? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for sure, if you extract a huge archive. But it need to do the extraction for unit testing and I thought it is more elegant than doing system calls to 'unzip' There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok! |
||
""" Reads all files of a zip file from disk. | ||
|
||
A helper function to read in all files of a zip archive as strings and | ||
return a dict of those strings where the keys are the filenames. | ||
|
||
Parameters | ||
---------- | ||
input_zip : str | ||
The filename of the archive. | ||
|
||
Returns | ||
------- | ||
A dict of str: keys = filenames in archive, values = content of files | ||
""" | ||
|
||
input_zip = zipfile.ZipFile(input_zip) | ||
return {name: input_zip.read(name) for name in input_zip.namelist()} | ||
|
||
|
||
def sneak_files(archive, len=1000): | ||
""" Returns the first characters of each file in an zip archive. | ||
|
||
Parameters | ||
---------- | ||
archive : dict{str : filename, str : filecontents} | ||
The already extracted zip archive in form of a dict, where keys are | ||
filenames and values are the content of the file. | ||
len : int | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
Number of characters returned from each file. | ||
Default: 1000. | ||
|
||
Returns | ||
------- | ||
dict{str, str} where the first component is the filename and the second | ||
the first <len> characters of the file.""" | ||
return map(lambda (k, v): {k: v[:len]}, archive.items()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just FYI, if the string is shorter it will return the full string ... this can cause problems in certain cases. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should be fine in this case, but thanks for the info There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I now know :-) |
||
|
||
|
||
if __name__ == "__main__": | ||
# Run a test | ||
imz = InMemoryZip() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -272,55 +272,54 @@ def test_get_ag_barcode_details(self): | |
obs = db.get_ag_barcode_details(['000018046']) | ||
ag_login_id = '0060a301-e5bf-6a4e-e050-8a800c5d49b7' | ||
exp = {'000018046': { | ||
'ag_kit_barcode_id': '0060a301-e5c1-6a4e-e050-8a800c5d49b7', | ||
'verification_email_sent': 'n', | ||
'pass_reset_code': None, | ||
'vioscreen_status': 3, | ||
'sample_barcode_file': '000018046.jpg', | ||
'environment_sampled': None, | ||
'supplied_kit_id': db.ut_get_supplied_kit_id(ag_login_id), | ||
'withdrawn': None, | ||
'kit_verified': 'y', | ||
# 'city': 'REMOVED', | ||
'ag_kit_id': '0060a301-e5c0-6a4e-e050-8a800c5d49b7', | ||
# 'zip': 'REMOVED', | ||
'ag_login_id': ag_login_id, | ||
# 'state': 'REMOVED', | ||
'results_ready': 'Y', | ||
'moldy': 'N', | ||
# The key 'registered_on' is a time stamp when the database is | ||
# created. It is unique per deployment. | ||
# 'registered_on': datetime.datetime(2016, 8, 17, 10, 47, 2, | ||
# 713292), | ||
# 'kit_password': ('$2a$10$2.6Y9HmBqUFmSvKCjWmBte70WF.zd3h4Vqb' | ||
# 'hLMQK1xP67Aj3rei86'), | ||
# 'deposited': False, | ||
'sample_date': datetime.date(2014, 8, 13), | ||
# 'email': 'REMOVED', | ||
'print_results': False, | ||
'open_humans_token': None, | ||
# 'elevation': 0.0, | ||
'refunded': None, | ||
# 'other_text': 'REMOVED', | ||
'barcode': '000018046', | ||
'swabs_per_kit': 1L, | ||
# 'kit_verification_code': '60260', | ||
# 'latitude': 0.0, | ||
'cannot_geocode': None, | ||
# 'address': 'REMOVED', | ||
'date_of_last_email': datetime.date(2014, 8, 15), | ||
'site_sampled': 'Stool', | ||
# 'name': 'REMOVED', | ||
'sample_time': datetime.time(11, 15), | ||
# 'notes': 'REMOVED', | ||
'overloaded': 'N', | ||
# 'longitude': 0.0, | ||
'pass_reset_time': None, | ||
# 'country': 'REMOVED', | ||
'survey_id': '084532330aca5885', | ||
'other': 'N', | ||
'sample_barcode_file_md5': None | ||
}} | ||
'ag_kit_barcode_id': '0060a301-e5c1-6a4e-e050-8a800c5d49b7', | ||
'verification_email_sent': 'n', | ||
'pass_reset_code': None, | ||
'vioscreen_status': 3, | ||
'sample_barcode_file': '000018046.jpg', | ||
'environment_sampled': None, | ||
'supplied_kit_id': db.ut_get_supplied_kit_id(ag_login_id), | ||
'withdrawn': None, | ||
'kit_verified': 'y', | ||
# 'city': 'REMOVED', | ||
'ag_kit_id': '0060a301-e5c0-6a4e-e050-8a800c5d49b7', | ||
# 'zip': 'REMOVED', | ||
'ag_login_id': ag_login_id, | ||
# 'state': 'REMOVED', | ||
'results_ready': 'Y', | ||
'moldy': 'N', | ||
# The key 'registered_on' is a time stamp when the database is | ||
# created. It is unique per deployment. | ||
# 'registered_on': datetime.datetime(2016, 8, 17, 10, 47, 2, | ||
# 713292), | ||
# 'kit_password': ('$2a$10$2.6Y9HmBqUFmSvKCjWmBte70WF.zd3h4Vqb' | ||
# 'hLMQK1xP67Aj3rei86'), | ||
# 'deposited': False, | ||
'sample_date': datetime.date(2014, 8, 13), | ||
# 'email': 'REMOVED', | ||
'print_results': False, | ||
'open_humans_token': None, | ||
# 'elevation': 0.0, | ||
'refunded': None, | ||
# 'other_text': 'REMOVED', | ||
'barcode': '000018046', | ||
'swabs_per_kit': 1L, | ||
# 'kit_verification_code': '60260', | ||
# 'latitude': 0.0, | ||
'cannot_geocode': None, | ||
# 'address': 'REMOVED', | ||
'date_of_last_email': datetime.date(2014, 8, 15), | ||
'site_sampled': 'Stool', | ||
# 'name': 'REMOVED', | ||
'sample_time': datetime.time(11, 15), | ||
# 'notes': 'REMOVED', | ||
'overloaded': 'N', | ||
# 'longitude': 0.0, | ||
'pass_reset_time': None, | ||
# 'country': 'REMOVED', | ||
'survey_id': '084532330aca5885', | ||
'other': 'N', | ||
'sample_barcode_file_md5': None}} | ||
participant_names = db.ut_get_participant_names_from_ag_login_id( | ||
ag_login_id) | ||
for key in obs: | ||
|
@@ -329,6 +328,23 @@ def test_get_ag_barcode_details(self): | |
self.assertEqual({k: obs[key][k] for k in exp[key]}, exp[key]) | ||
self.assertIn(obs[key]['participant_name'], participant_names) | ||
|
||
def test_list_ag_surveys(self): | ||
truth = [(-1, 'Personal Information', True), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this test can be simplified as self.assertItemsEqual(db.list_ag_surveys(), truth) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nope. Consider that we add surveys in the future. This would brake your test. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The test would not be correct, since it will not be testing that all surveys are returned. |
||
(-2, 'Pet Information', True), | ||
(-3, 'Fermented Foods', True), | ||
(-4, 'Surfers', True), | ||
(-5, 'Personal_Microbiome', True)] | ||
for survey in truth: | ||
self.assertIn(survey, db.list_ag_surveys()) | ||
|
||
truth = [(-1, 'Personal Information', False), | ||
(-2, 'Pet Information', True), | ||
(-3, 'Fermented Foods', False), | ||
(-4, 'Surfers', True), | ||
(-5, 'Personal_Microbiome', False)] | ||
for survey in truth: | ||
self.assertIn(survey, db.list_ag_surveys([-2, -4])) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to clarify - this is testing the real database if these surveys in the db? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's testing that this method can obtain survey names AND that the third component is set to True for all surveys if NONE is provided and only those set to True that are in a list given to the function as third argument. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I do think that this is a really critical check. Now it looks like every time that a survey gets added, this test would need to be updated. What do you think about this? Do you think that it is worth while to raise an issue to make these tests - so that we don't need to update these tests every time that the surveys are updated. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think that adding new surveys is a frequently occurring task. Thus, updating the test each time should be fine. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok that's fine - but what do you think about having some sort of documentation that contains all of the tests that need to be updated? If there is an additional survey being added - it could be a pain in the butt to hunt and find which tests need to be updated. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On a second thought, I totally agree with our argument that this is not future save. Thus, I changed the code in a way that additional surveys should not influence the outcome. |
||
|
||
if __name__ == "__main__": | ||
main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is this if really necessary? you could do:
selected_ag_surveys = map(int, self.get_argument('selected_ag_surveys', []).split(','))
right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I figured that it can be even easier than that, if we use get_aguments (not the s). Then, you can provide a list of strings as parameters.