Skip to content

Commit 08a59e2

Browse files
fix random choices taxonomy
1 parent 4245ca0 commit 08a59e2

File tree

2 files changed

+60
-45
lines changed

2 files changed

+60
-45
lines changed

code/BacDup/modules/input_parser.py

Lines changed: 49 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -16,12 +16,16 @@
1616
import argparse
1717
import time
1818
from Bio import SeqIO
19-
import HCGB
20-
from HCGB.functions.aesthetics_functions import debug_message
21-
import HCGB.functions.time_functions as time_functions
2219
from termcolor import colored
2320
import pandas as pd
2421

22+
## import HCGB
23+
from HCGB.functions.aesthetics_functions import debug_message
24+
import HCGB.functions.time_functions as HCGB_time
25+
import HCGB.functions.files_functions as HCGB_files
26+
import HCGB.functions.aesthetics_functions as HCGB_aes
27+
import HCGB.functions.main_functions as HCGB_main
28+
2529
## my modules
2630
import BacDup
2731
import BacDup.scripts.gbf_parser as gbf_parser
@@ -47,9 +51,9 @@ def run_input(arg_dict):
4751
exit()
4852

4953
BacDup_functions.pipeline_header('BacDup')
50-
HCGB.functions.aesthetics_functions.boxymcboxface("Preparing input files")
54+
HCGB_aes.boxymcboxface("Preparing input files")
5155
print ("--------- Starting Process ---------")
52-
time_functions.print_time()
56+
HCGB_time.print_time()
5357

5458
## init time
5559
start_time_total = time.time()
@@ -60,7 +64,7 @@ def run_input(arg_dict):
6064

6165
## output folder
6266
print ("\n+ Create output folder(s):")
63-
HCGB.functions.files_functions.create_folder(outdir)
67+
HCGB_files.create_folder(outdir)
6468

6569
## set defaults
6670
if not (arg_dict.assembly_level):
@@ -76,7 +80,7 @@ def run_input(arg_dict):
7680
else:
7781
arg_dict.project = True
7882
print ("+ Generate a directory containing information within the project folder provided")
79-
final_dir = HCGB.functions.files_functions.create_subfolder("info", outdir)
83+
final_dir = HCGB_files.create_subfolder("info", outdir)
8084

8185
## debug messages
8286
if (arg_dict.debug):
@@ -90,7 +94,7 @@ def run_input(arg_dict):
9094

9195
## get files
9296
print ()
93-
HCGB.functions.aesthetics_functions.print_sepLine("-",50, False)
97+
HCGB_aes.print_sepLine("-",50, False)
9498
print ('+ Getting input information provided... ')
9599
print ('+ Several options available:')
96100
print ('\t* Single/Multiple Annotation file:')
@@ -104,7 +108,7 @@ def run_input(arg_dict):
104108
time.sleep(1)
105109

106110
## time stamp
107-
start_time_partial = time_functions.timestamp(start_time_total)
111+
start_time_partial = HCGB_time.timestamp(start_time_total)
108112

109113
#################################################
110114
## Parse and obtain the type of input information provided
@@ -116,24 +120,24 @@ def run_input(arg_dict):
116120
## 'plasmids_number','plasmids_ID'))
117121

118122
## time stamp
119-
start_time_partial = time_functions.timestamp(start_time_partial)
123+
start_time_partial = HCGB_time.timestamp(start_time_partial)
120124

121125
## parse information accordingly
122126
parse_information(arg_dict, df_accID, outdir)
123127

124128
### report generation
125-
HCGB.functions.aesthetics_functions.boxymcboxface("Summarizing input files")
126-
outdir_report = HCGB.functions.files_functions.create_subfolder("report", outdir)
129+
HCGB_aes.boxymcboxface("Summarizing input files")
130+
outdir_report = HCGB_files.create_subfolder("report", outdir)
127131

128-
input_report = HCGB.functions.files_functions.create_subfolder("input", outdir_report)
132+
input_report = HCGB_files.create_subfolder("input", outdir_report)
129133

130134
## add df_accID.loc[sample,] information as csv into input folder
131135
df_accID.to_csv(os.path.join(input_report, 'info.csv'), index=True, header=True)
132136

133137
## maybe add a summary of the files?
134138

135139
print ("\n*************** Finish *******************")
136-
start_time_partial = time_functions.timestamp(start_time_total)
140+
start_time_partial = HCGB_time.timestamp(start_time_total)
137141

138142
print ("+ Exiting Input module.")
139143
return()
@@ -142,8 +146,8 @@ def run_input(arg_dict):
142146
def parse_information(arg_dict, df_accID, outdir):
143147

144148
### Parse df_accID
145-
dict_input_folders = HCGB.functions.files_functions.outdir_project(outdir, arg_dict.project, df_accID, "input", arg_dict.debug)
146-
dict_parse_folders = HCGB.functions.files_functions.outdir_project(outdir, arg_dict.project, df_accID, "parse", arg_dict.debug)
149+
dict_input_folders = HCGB_files.outdir_project(outdir, arg_dict.project, df_accID, "input", arg_dict.debug)
150+
dict_parse_folders = HCGB_files.outdir_project(outdir, arg_dict.project, df_accID, "parse", arg_dict.debug)
147151

148152
## debug messages
149153
if (arg_dict.debug):
@@ -170,34 +174,34 @@ def parse_information(arg_dict, df_accID, outdir):
170174
print()
171175
print ("\t+ Parsing sample: " + sample)
172176

173-
if (not HCGB.functions.files_functions.is_non_zero_file(parse_timestamp) and not HCGB.functions.files_functions.is_non_zero_file(input_timestamp)):
177+
if (not HCGB_files.is_non_zero_file(parse_timestamp) and not HCGB_files.is_non_zero_file(input_timestamp)):
174178

175179
## TODO: Set threads to use in parallel
176180
process_OK = parse_annot_file(sample, folder_input, df_accID.loc[sample, 'annot_file'], dict_parse_folders[sample], arg_dict.debug, df_accID.loc[sample, 'genome'])
177181

178182
if (process_OK):
179183

180184
## link or copy annotation file into folder_input
181-
HCGB.functions.files_functions.get_symbolic_link_file(df_accID.loc[sample, 'annot_file'], folder_input)
185+
HCGB_files.get_symbolic_link_file(df_accID.loc[sample, 'annot_file'], folder_input)
182186

183187
## add df_accID.loc[sample,] information as csv into input folder
184188
df_accID.loc[sample,].to_csv(os.path.join(folder_input, 'info.csv'), index=True, header=True)
185189

186190
## print time stamp
187-
time_functions.print_time_stamp(input_timestamp)
191+
HCGB_time.print_time_stamp(input_timestamp)
188192

189193
## print time stamp
190-
time_functions.print_time_stamp(parse_timestamp)
194+
HCGB_time.print_time_stamp(parse_timestamp)
191195
else:
192196
print(colored("\t+ Some error occurred for sample %s while parsing input options" %sample, 'red'))
193197

194198
## print time stamp
195-
time_functions.print_time_stamp(os.path.join(folder_input, '.fail'))
199+
HCGB_time.print_time_stamp(os.path.join(folder_input, '.fail'))
196200

197201
## print time stamp
198-
time_functions.print_time_stamp(os.path.join(dict_parse_folders[sample], '.fail'))
202+
HCGB_time.print_time_stamp(os.path.join(dict_parse_folders[sample], '.fail'))
199203
else:
200-
read_time = time_functions.read_time_stamp(parse_timestamp)
204+
read_time = HCGB_time.read_time_stamp(parse_timestamp)
201205
print (colored("\t+ Input parsing already available for sample %s [%s]" %(sample, read_time), 'green'))
202206
print()
203207

@@ -250,7 +254,7 @@ def parse_annot_file(name, folder_out_input, annot_file, output_path, Debug, ref
250254

251255
elif(format=='gff'):
252256
print (colored('\t* GFF format file:.......[OK]', 'green'))
253-
if (HCGB.functions.files_functions.is_non_zero_file(ref_file)):
257+
if (HCGB_files.is_non_zero_file(ref_file)):
254258
return(gff_parser.gff_parser_caller(annot_file, ref_file, output_path, Debug))
255259
else:
256260
print(colored("ERROR: No genome reference file provided for this GFF annotation. Check input options provided.","red"))
@@ -293,7 +297,7 @@ def parse_options(arg_dict):
293297
BacDup_functions.file_readable_check(arg_dict.annot_file)
294298

295299
print (colored('\t* Multiple annotation files provided .......[OK]', 'green'))
296-
dict_entries = HCGB.functions.main_functions.file2dictionary(arg_dict.annot_file, ',')
300+
dict_entries = HCGB_main.file2dictionary(arg_dict.annot_file, ',')
297301

298302
## debug messages
299303
if (arg_dict.debug):
@@ -361,7 +365,7 @@ def parse_options(arg_dict):
361365
BacDup_functions.file_readable_check(arg_dict.ref_file)
362366

363367
if (arg_dict.batch):
364-
ref_entries = HCGB.functions.main_functions.file2dictionary(arg_dict.ref_file, ',')
368+
ref_entries = HCGB_main.file2dictionary(arg_dict.ref_file, ',')
365369
genome = ref_entries[name]
366370
else:
367371
genome = arg_dict.ref_file
@@ -382,9 +386,9 @@ def parse_options(arg_dict):
382386
elif (arg_dict.GenBank_id):
383387
## get database path
384388
if (arg_dict.db_folder):
385-
db_folder = HCGB.functions.files_functions.create_folder(os.path.abspath(arg_dict.db_folder))
389+
db_folder = HCGB_files.create_folder(os.path.abspath(arg_dict.db_folder))
386390
else:
387-
db_folder = HCGB.functions.files_functions.create_subfolder("db", os.path.abspath(arg_dict.output_folder))
391+
db_folder = HCGB_files.create_subfolder("db", os.path.abspath(arg_dict.output_folder))
388392

389393
## debug messages
390394
if (arg_dict.debug):
@@ -410,7 +414,7 @@ def parse_options(arg_dict):
410414
print()
411415

412416
## call IDs into a list and create tmp folder
413-
strains2get = HCGB.functions.main_functions.readList_fromFile(arg_dict.GenBank_id)
417+
strains2get = HCGB_main.readList_fromFile(arg_dict.GenBank_id)
414418
strains2get = list(filter(None, strains2get))
415419

416420
## debug messages
@@ -435,7 +439,7 @@ def parse_options(arg_dict):
435439
## download
436440
print (colored('\t* A NCBI GenBank ID:.......[OK]', 'green'))
437441
print()
438-
HCGB.functions.aesthetics_functions.print_sepLine("+", 75, False)
442+
HCGB_aes.print_sepLine("+", 75, False)
439443
df_accID = BacDup.scripts.NCBI_downloader.NCBIdownload(arg_dict.GenBank_id, db_folder, arg_dict.debug)
440444

441445
## --------------------------------------- ##
@@ -457,7 +461,7 @@ def parse_options(arg_dict):
457461
BacDup_functions.file_readable_check(arg_dict.tax_id)
458462

459463
## get IDs into a list
460-
taxIDs2get = HCGB.functions.main_functions.readList_fromFile(arg_dict.tax_id)
464+
taxIDs2get = HCGB_main.readList_fromFile(arg_dict.tax_id)
461465

462466
else:
463467
print (colored('\t* A NCBI Taxonomy ID:.......[OK]', 'green'))
@@ -496,9 +500,9 @@ def parse_options(arg_dict):
496500
## get database path
497501
#################
498502
if (arg_dict.db_folder):
499-
db_folder = HCGB.functions.files_functions.create_folder(os.path.abspath(arg_dict.db_folder))
503+
db_folder = HCGB_files.create_folder(os.path.abspath(arg_dict.db_folder))
500504
else:
501-
db_folder = HCGB.functions.files_functions.create_subfolder("db", outdir)
505+
db_folder = HCGB_files.create_subfolder("db", outdir)
502506

503507
## debug messages
504508
if arg_dict.debug:
@@ -518,20 +522,25 @@ def parse_options(arg_dict):
518522

519523
## print list and dictionary of possible and selected taxIDs
520524
outdir = os.path.abspath(arg_dict.output_folder)
521-
final_dir = HCGB.functions.files_functions.create_subfolder("info", outdir)
522-
input_info_dir = HCGB.functions.files_functions.create_subfolder("input", outdir)
523-
HCGB.functions.main_functions.printList2file(os.path.join(input_info_dir, 'Downloaded.txt'), strains2get)
524-
HCGB.functions.main_functions.printList2file(os.path.join(input_info_dir, 'all_entries.txt'), allstrains_available)
525+
info_dir = HCGB_files.create_subfolder("info", outdir)
526+
input_info_dir = HCGB_files.create_subfolder("input", info_dir)
527+
HCGB_main.printList2file(os.path.join(input_info_dir, 'Downloaded.txt'), strains2get)
528+
HCGB_main.printList2file(os.path.join(input_info_dir, 'all_entries.txt'), allstrains_available)
525529

526530
## save into file
527531
file_info = os.path.join(input_info_dir, 'info.txt')
528532

529533
## stop here if dry_run
530534
if arg_dict.dry_run:
535+
print()
536+
HCGB_aes.print_sepLine("*", 75, False)
531537
print ("ATTENTION: Dry run mode selected. Stopping the process here.")
532-
print("All available entries listed and printed in file: "+ os.path.join(input_info_dir, 'all_entries.txt'))
533-
print("Subset of entries generated and printed in file: "+ os.path.join(input_info_dir, 'Downloaded.txt'))
534-
print ("\n\nIf random numbers selected, take into account re-running this process might produce different results.")
538+
HCGB_aes.print_sepLine("*", 75, False)
539+
print("+ All available entries listed and printed in file:\n\t"+ os.path.join(input_info_dir, 'all_entries.txt'))
540+
print("+ Subset of entries generated and printed in file:\n\t"+ os.path.join(input_info_dir, 'Downloaded.txt'))
541+
print ("\n\nIf random numbers selected, take into account re-running this process might produce different results.\n")
542+
HCGB_aes.print_sepLine("*", 75, False)
543+
print()
535544
exit()
536545

537546
#################

code/BacDup/scripts/taxonomy_retrieval.py

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -131,12 +131,18 @@ def get_GenBank_ids(data_folder, taxID_list, random_k, debug, assembly_level_giv
131131
##
132132
if random_k<0:
133133
list_entries = list(dict_entries.keys())
134-
print ('All %s entries selected' %dict_entries_len)
135-
134+
print ('+ All %s entries selected' %dict_entries_len)
136135
else:
137-
print ("Selecting random entries retrieved:")
138-
list_entries = random.choices(list(dict_entries.keys()), k=random_k)
139-
print ('%s entries selected out of %s' %(str(random_k), str(dict_entries_len)))
136+
if random_k > dict_entries_len:
137+
print("+ Sample size desired larger than population.")
138+
list_entries = list(dict_entries.keys())
139+
print ('\tOnly %s entries selected out of %s specified' %(dict_entries_len, random_k))
140+
else:
141+
print ("+ Selecting random entries retrieved:")
142+
list_entries = random.sample(set(list(dict_entries.keys())), k=random_k)
143+
print ('\t%s entries selected out of %s' %(str(random_k), str(dict_entries_len)))
144+
145+
print()
140146

141147
## debug messages
142148
if debug:

0 commit comments

Comments
 (0)