Skip to content

Commit cb88385

Browse files
authored
Merge pull request #60 from usegalaxy-eu/remote-check
Automatic check if an object already exists on ENA
2 parents 5d27dce + 0c74f43 commit cb88385

File tree

5 files changed

+114
-45
lines changed

5 files changed

+114
-45
lines changed

README.md

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,16 +60,17 @@ All supported arguments:
6060
--experiment EXPERIMENT
6161
table of EXPERIMENT object
6262
--run RUN table of RUN object
63-
--data [FILE ...] data for submission
63+
--data [FILE [FILE ...]]
64+
data for submission
6465
--center CENTER_NAME specific to your Webin account
6566
--checklist CHECKLIST
6667
specify the sample checklist with following pattern: ERC0000XX, Default: ERC000011
67-
--xlsx XLSX Excel table with metadata
68+
--xlsx XLSX filled in excel template with metadata
69+
--auto_action BETA: detect automatically which action (add or modify) to apply when the action column is not given
6870
--tool TOOL_NAME specify the name of the tool this submission is done with. Default: ena-upload-cli
6971
--tool_version TOOL_VERSION
7072
specify the version of the tool this submission is done with
71-
--no_data_upload indicate if no upload should be performed and you like to submit a RUN object (e.g. if uploaded
72-
was done separately).
73+
--no_data_upload indicate if no upload should be performed and you like to submit a RUN object (e.g. if uploaded was done separately).
7374
--draft indicate if no submission should be performed
7475
--secret SECRET .secret.yml file containing the password and Webin ID of your ENA account
7576
-d, --dev flag to use the dev/sandbox endpoint of ENA
@@ -161,7 +162,10 @@ Use the *--dev* flag if you want to do a test submission using the tool by the s
161162

162163
### Submitting a selection of rows to ENA
163164

164-
Optionally you can add a status column to every table that contains the action you want to apply during this submission. If you chose to add only the first 2 samples to ENA, you specify `--action add` as parameter in the command and you add the `add` value to the status column of the rows you want to submit as demonstrated below. Same holds for the action `modify`, `release` and `cancel`.
165+
There are two ways of submitting only a selection of objects to ENA. This is handy for recurring submissions, especially when they belong to the same study.
166+
167+
- Manual: you can add an optional `status` column to every table/sheet that contains the action you want to apply during this submission. If you choose to add only the first 2 samples to ENA, you specify `--action add` as parameter in the command and you add the `add` value to the status column of the rows you want to submit as demonstrated below. The same holds for the actions `modify`, `release` and `cancel`.
168+
- Automatic (BETA): using the `--auto_action` flag, the tool can automatically detect whether an object (identified by its alias) is already present on ENA and fills in the specified action (`--action` parameter) accordingly. In practice, this means that if a user chooses to add objects and an object with the same alias already exists on ENA, this object will not be added. On the other hand, if the command is used to modify objects, the action is applied solely to objects that already exist on ENA. The detection only works with ENA objects that are published and findable on the website through the search function (both the dev and live website). If the tool does not correctly detect the presence of your ENA object, we suggest using the more robust manual approach described above.
165169

166170
**Example with modify as seen in the [example sample modify table](example_tables/ENA_template_samples_modify.tsv)**
167171

ena_upload/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.1"
1+
__version__ = "0.5.2"

ena_upload/check_remote.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import json
2+
import requests
3+
4+
# Search endpoint of the ENA portal API on the main www.ebi.ac.uk host.
URL = "https://www.ebi.ac.uk/ena/portal/api/search"
5+
# Search endpoint on the wwwdev.ebi.ac.uk host; queried instead of URL when `dev` is truthy.
DEV_URL = "https://wwwdev.ebi.ac.uk/ena/portal/api/search"
6+
7+
def identify_action(entry_type, alias, dev):
    '''
    Report whether an entry of this type and alias is found on ENA.

    entry_type = [study | sample | experiment | run]
    alias      = alias of the entry to look up
    dev        = passed through to check_remote_entry to pick the endpoint

    Prints the outcome and returns True when the remote search yields a
    non-empty list, False otherwise. The caller decides which action
    (add / modify) follows from that.
    '''
    lookup = {entry_type + '_alias': alias}
    hits = check_remote_entry(entry_type, lookup, dev)
    # anything that is not a non-empty list counts as "not present"
    found = isinstance(hits, list) and len(hits) > 0
    if found:
        message = f'\tFound: {entry_type} entry with alias {alias}'
    else:
        message = f'\tNo {entry_type} entry found with alias {alias}'
    print(message)
    return found
17+
18+
19+
def check_remote_entry(entry_type, query_dict, dev, timeout=30):
    '''
    Search the ENA portal API for entries matching the given fields.

    entry_type = [study | sample | experiment | run]
    query_dict = {field name: value}; all pairs are combined with AND
    dev        = truthy to query DEV_URL instead of URL
    timeout    = seconds to wait for the server before requests gives up

    Returns the decoded JSON body of the response, or an empty list when
    the response body is empty.
    Raises ValueError for an unsupported entry_type; exceptions raised by
    requests (connection problems, timeouts, HTTP error statuses) and by
    JSON decoding are propagated to the caller.
    '''
    # explicit check instead of assert: asserts are stripped under `python -O`
    if entry_type not in ('study', 'sample', 'experiment', 'run'):
        raise ValueError(f"Unsupported entry type: {entry_type!r}")

    params_dict = {
        'query': ' AND '.join(
            f'{key}="{value}"' for key, value in query_dict.items()),
        'result': 'read_' + entry_type,
        'fields': entry_type + '_alias',
        'format': 'json',
    }
    endpoint = DEV_URL if dev else URL
    # without a timeout requests waits indefinitely on an unresponsive server
    response = requests.post(endpoint, data=params_dict, timeout=timeout)
    # fail loudly on HTTP errors so an error body is never read as a result
    response.raise_for_status()
    # an empty body means the search matched nothing
    if not response.content:
        return []
    return json.loads(response.content)

ena_upload/ena_upload.py

Lines changed: 66 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,12 @@
2020
import pandas as pd
2121
import tempfile
2222
from ena_upload._version import __version__
23+
from ena_upload.check_remote import identify_action
2324

2425
SCHEMA_TYPES = ['study', 'experiment', 'run', 'sample']
2526

27+
STATUS_CHANGES = {'ADD': 'ADDED', 'MODIFY': 'MODIFIED',
28+
'CANCEL': 'CANCELLED', 'RELEASE': 'RELEASED'}
2629

2730
class MyFTP_TLS(ftplib.FTP_TLS):
2831
"""Explicit FTPS, with shared TLS session"""
@@ -36,7 +39,7 @@ def ntransfercmd(self, cmd, rest=None):
3639
return conn, size
3740

3841

39-
def create_dataframe(schema_tables, action):
42+
def create_dataframe(schema_tables, action, dev, auto_action):
4043
'''create pandas dataframe from the tables in schema_tables
4144
and return schema_dataframe
4245
@@ -54,7 +57,7 @@ def create_dataframe(schema_tables, action):
5457
for schema, table in schema_tables.items():
5558
df = pd.read_csv(table, sep='\t', comment='#', dtype=str)
5659
df = df.dropna(how='all')
57-
df = check_columns(df, schema, action)
60+
df = check_columns(df, schema, action, dev, auto_action)
5861
schema_dataframe[schema] = df
5962

6063
return schema_dataframe
@@ -80,7 +83,7 @@ def extract_targets(action, schema_dataframe):
8083
return schema_targets
8184

8285

83-
def check_columns(df, schema, action):
86+
def check_columns(df, schema, action, dev, auto_action):
8487
# checking for optional columns and if not present, adding them
8588
if schema == 'sample':
8689
optional_columns = ['accession', 'submission_date',
@@ -94,10 +97,32 @@ def check_columns(df, schema, action):
9497
for header in optional_columns:
9598
if not header in df.columns:
9699
if header == 'status':
97-
# status column contain action keywords
98-
# for xml rendering, keywords require uppercase
99-
# according to scheme definition of submission
100-
df[header] = str(action).upper()
100+
if auto_action:
101+
for index, row in df.iterrows():
102+
remote_present = np.nan
103+
try:
104+
remote_present = str(identify_action(
105+
schema, str(df['alias'][index]), dev)).upper()
106+
107+
except Exception as e:
108+
print(e)
109+
print(
110+
f"Something went wrong with detecting the ENA object {df['alias'][index]} on the servers of ENA. This object will be skipped.")
111+
if remote_present == np.nan:
112+
df.at[index, header] = np.nan
113+
elif remote_present and action == 'MODIFY':
114+
df.at[index, header] = action
115+
print(
116+
f"\t'{df['alias'][index]}' gets '{remote_present}' as action in the status column")
117+
elif not remote_present and action in ['ADD', 'CANCEL', 'RELEASE']:
118+
df.at[index, header] = action
119+
print(
120+
f"\t'{df['alias'][index]}' gets '{remote_present}' as action in the status column")
121+
else:
122+
# status column contain action keywords
123+
# for xml rendering, keywords require uppercase
124+
# according to scheme definition of submission
125+
df[header] = str(action).upper()
101126
else:
102127
df[header] = np.nan
103128
else:
@@ -106,6 +131,7 @@ def check_columns(df, schema, action):
106131

107132
return df
108133

134+
109135
def check_filenames(file_paths, run_df):
110136
"""Compare data filenames from command line and from RUN table.
111137
@@ -462,16 +488,12 @@ def process_receipt(receipt, action):
462488
errors = '\nOops:\n' + '\n'.join(errors)
463489
sys.exit(errors)
464490

465-
# define expected status based on action
466-
status = {'ADD': 'added', 'MODIFY': 'modified',
467-
'CANCEL': 'cancelled', 'RELEASE': 'released'}
468-
469491
def make_update(update, ena_type):
470492
update_list = []
471493
print(f"\n{ena_type.capitalize()} accession details:")
472494
for element in update:
473495
extract = (element.get('alias'), element.get(
474-
'accession'), receiptDate, status[action])
496+
'accession'), receiptDate, STATUS_CHANGES[action])
475497
print("\t".join(extract))
476498
update_list.append(extract)
477499
# used for labelling dataframe
@@ -515,7 +537,7 @@ def make_update(update, ena_type):
515537
print(f"\n{ena_type.capitalize()} accession details:")
516538
update_list = []
517539
for accession in accessions:
518-
extract = (accession, receiptDate, status[action])
540+
extract = (accession, receiptDate, STATUS_CHANGES[action])
519541
update_list.append(extract)
520542
print("\t".join(extract))
521543

@@ -587,9 +609,6 @@ def update_table_simple(schema_dataframe, schema_targets, action):
587609
:return schema_dataframe: a dictionary - {schema:dataframe}
588610
dataframe -- updated status
589611
"""
590-
# define expected status based on action
591-
status = {'ADD': 'added', 'MODIFY': 'modified',
592-
'CANCEL': 'cancelled', 'RELEASE': 'released'}
593612

594613
for schema in schema_targets.keys():
595614
dataframe = schema_dataframe[schema]
@@ -599,7 +618,7 @@ def update_table_simple(schema_dataframe, schema_targets, action):
599618
targets.set_index('alias', inplace=True)
600619

601620
for index in targets.index:
602-
dataframe.loc[index, 'status'] = status[action]
621+
dataframe.loc[index, 'status'] = STATUS_CHANGES[action]
603622

604623
return schema_dataframe
605624

@@ -687,10 +706,15 @@ def process_args():
687706

688707
parser.add_argument('--checklist', help="specify the sample checklist with following pattern: ERC0000XX, Default: ERC000011", dest='checklist',
689708
default='ERC000011')
690-
709+
691710
parser.add_argument('--xlsx',
692-
help='excel table with metadata')
693-
711+
help='filled in excel template with metadata')
712+
713+
parser.add_argument('--auto_action',
714+
action="store_true",
715+
default=False,
716+
help='BETA: detect automatically which action (add or modify) to apply when the action column is not given')
717+
694718
parser.add_argument('--tool',
695719
dest='tool_name',
696720
default='ena-upload-cli',
@@ -730,15 +754,15 @@ def process_args():
730754
if not os.path.isfile(args.secret):
731755
msg = f"Oops, the file {args.secret} does not exist"
732756
parser.error(msg)
733-
757+
734758
# check if xlsx file exists
735759
if args.xlsx:
736760
if not os.path.isfile(args.xlsx):
737761
msg = f"Oops, the file {args.xlsx} does not exist"
738762
parser.error(msg)
739763

740764
# check if data is given when adding a 'run' table
741-
if (not args.no_data_upload and args.run and args.action.upper() not in ['RELEASE','CANCEL']) or (not args.no_data_upload and args.xlsx and args.action.upper() not in ['RELEASE','CANCEL']):
765+
if (not args.no_data_upload and args.run and args.action.upper() not in ['RELEASE', 'CANCEL']) or (not args.no_data_upload and args.xlsx and args.action.upper() not in ['RELEASE', 'CANCEL']):
742766
if args.data is None:
743767
parser.error('Oops, requires data for submitting RUN object')
744768

@@ -767,6 +791,7 @@ def collect_tables(args):
767791

768792
return schema_tables
769793

794+
770795
def update_date(date):
771796
if pd.isnull(date) or isinstance(date, str):
772797
return date
@@ -788,6 +813,7 @@ def main():
788813
secret = args.secret
789814
draft = args.draft
790815
xlsx = args.xlsx
816+
auto_action = args.auto_action
791817

792818
with open(secret, 'r') as secret_file:
793819
credentials = yaml.load(secret_file, Loader=yaml.FullLoader)
@@ -812,16 +838,19 @@ def main():
812838
elif f"ENA_{schema}" in xl_workbook.book.sheetnames:
813839
xl_sheet = xl_workbook.parse(f"ENA_{schema}", header=0)
814840
else:
815-
sys.exit(f"The sheet '{schema}' is not present in the excel sheet {xlsx}")
841+
sys.exit(
842+
f"The sheet '{schema}' is not present in the excel sheet {xlsx}")
816843
xl_sheet = xl_sheet.drop(0).dropna(how='all')
817844
for column_name in list(xl_sheet.columns.values):
818845
if 'date' in column_name:
819-
xl_sheet[column_name] = xl_sheet[column_name].apply(update_date)
846+
xl_sheet[column_name] = xl_sheet[column_name].apply(
847+
update_date)
820848

821849
if True in xl_sheet.columns.duplicated():
822850
sys.exit("Duplicated columns found")
823851

824-
xl_sheet = check_columns(xl_sheet, schema, action)
852+
xl_sheet = check_columns(
853+
xl_sheet, schema, action, dev, auto_action)
825854
schema_dataframe[schema] = xl_sheet
826855
path = os.path.dirname(os.path.abspath(xlsx))
827856
schema_tables[schema] = f"{path}/ENA_template_{schema}.tsv"
@@ -830,7 +859,8 @@ def main():
830859
schema_tables = collect_tables(args)
831860

832861
# create dataframe from table
833-
schema_dataframe = create_dataframe(schema_tables, action)
862+
schema_dataframe = create_dataframe(
863+
schema_tables, action, dev, auto_action)
834864

835865
# ? add a function to sanitize characters
836866
# ? print 'validate table for specific action'
@@ -854,11 +884,11 @@ def main():
854884
file_paths = {}
855885
if args.data:
856886
for path in args.data:
857-
file_paths[os.path.basename(path)] = os.path.abspath(path)
887+
file_paths[os.path.basename(path)] = os.path.abspath(path)
858888
# check if file names identical between command line and table
859889
# if not, system exits
860890
check_filenames(file_paths, df)
861-
891+
862892
# generate MD5 sum if not supplied in table
863893
if file_paths and not check_file_checksum(df):
864894
print("No valid checksums found, generate now...", end=" ")
@@ -953,18 +983,16 @@ def main():
953983
print("There was an ERROR during submission:")
954984
sys.exit(receipt)
955985

956-
if action in ['ADD', 'MODIFY']:
957-
schema_dataframe = update_table(schema_dataframe,
986+
if action in ['ADD', 'MODIFY'] and not draft:
987+
schema_dataframe = update_table(schema_dataframe,
958988
schema_targets,
959989
schema_update)
960-
# save updates in new tables
961-
save_update(schema_tables, schema_dataframe)
962-
elif action in ['CANCEL', 'RELEASE']:
963-
schema_dataframe = update_table_simple(schema_dataframe,
964-
schema_targets,
965-
action)
966-
# save updates in new tables
967-
save_update(schema_tables, schema_dataframe)
990+
else:
991+
schema_dataframe = update_table_simple(schema_dataframe,
992+
schema_targets,
993+
action)
994+
# save updates in new tables
995+
save_update(schema_tables, schema_dataframe)
968996

969997

970998
if __name__ == "__main__":

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from setuptools import setup, find_packages
1+
from setuptools import setup
22
from ena_upload._version import __version__
33

44
with open("README.md", 'r') as f:

0 commit comments

Comments
 (0)