Skip to content

Commit

Permalink
reran with black using line length 79
Browse files Browse the repository at this point in the history
  • Loading branch information
EC2 Default User committed Nov 29, 2023
1 parent 38a8de7 commit 96721a6
Show file tree
Hide file tree
Showing 16 changed files with 211 additions and 71 deletions.
12 changes: 8 additions & 4 deletions bioprojects/PRJEB13833/metadata/prepare-metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,16 @@
if line.startswith("run"):
continue

run_accession, sample_accession, sample_alias, sample_title = line.split(
"\t"
)
(
run_accession,
sample_accession,
sample_alias,
sample_title,
) = line.split("\t")

_, yyyy, mm, dd, site = sample_title.split("_")

outf.write(
"%s\t%s-%s-%s\tCluster %s\n" % (run_accession, yyyy, mm, dd, site)
"%s\t%s-%s-%s\tCluster %s\n"
% (run_accession, yyyy, mm, dd, site)
)
4 changes: 3 additions & 1 deletion bioprojects/PRJEB49260/metadata/prepare_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,6 @@
dd = date[2:4]
yy = date[4:6]

outf.write("%s\t20%s-%s-%s\t%s\n" % (run_accession, yy, mm, dd, location))
outf.write(
"%s\t20%s-%s-%s\t%s\n" % (run_accession, yy, mm, dd, location)
)
4 changes: 3 additions & 1 deletion bioprojects/PRJNA729801/metadata/parse_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ def start(raw_metadata_in, parsed_metadata_out):
data.sort()

with open(parsed_metadata_out, "w") as outf:
outf.write("\t".join(["filename", "date", "plant", "is_enriched"]) + "\n")
outf.write(
"\t".join(["filename", "date", "plant", "is_enriched"]) + "\n"
)
for plant, date, filename, is_enriched in data:
outf.write("\t".join([filename, date, plant, is_enriched]) + "\n")

Expand Down
12 changes: 9 additions & 3 deletions bioprojects/PRJNA812772/metadata/prepare_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,17 @@
with open("raw_metadata.tsv") as inf:
with open("metadata.tsv", "w") as outf:
for line in inf:
sample_accession, run_accession, sample_alias = line.strip().split("\t")
sample_accession, run_accession, sample_alias = line.strip().split(
"\t"
)

_, strategy = sample_alias.split("_")
if strategy == "sarscov2":
continue

collection_date = sample_accession_to_collection_date[sample_accession]
outf.write("%s\t%s\t%s\n" % (run_accession, strategy, collection_date))
collection_date = sample_accession_to_collection_date[
sample_accession
]
outf.write(
"%s\t%s\t%s\n" % (run_accession, strategy, collection_date)
)
8 changes: 6 additions & 2 deletions build_bowtie2_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,9 @@ def combine_genomes(combined_genomes_fname):
outf.writelines(inf.readlines())


def mask_low_complexity_sequences(combined_genomes_fname, masked_genomes_fname):
def mask_low_complexity_sequences(
combined_genomes_fname, masked_genomes_fname
):
if os.path.exists(masked_genomes_fname):
return
print("Masking low complexity sequences...")
Expand Down Expand Up @@ -149,7 +151,9 @@ def mask_low_complexity_sequences(combined_genomes_fname, masked_genomes_fname):
#
# This regexp replaces all lowercase letters that aren't on lines beginning
# with '>', which in FASTA means everywhere except in the sequence IDs.
subprocess.check_call(["sed", "/^>/!s/[a-z]/x/g", "-i", masked_genomes_fname])
subprocess.check_call(
["sed", "/^>/!s/[a-z]/x/g", "-i", masked_genomes_fname]
)


def build_db(bowtie_db_prefix, genomes_fname):
Expand Down
4 changes: 3 additions & 1 deletion count_clades.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
parents = {} # child_taxid -> parent_taxid
with open("dashboard/nodes.dmp") as inf:
for line in inf:
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
"\t|\t"
)
child_taxid = int(child_taxid)
parent_taxid = int(parent_taxid)
parents[child_taxid] = parent_taxid
Expand Down
4 changes: 3 additions & 1 deletion dashboard/determine_comparison_species.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
children = defaultdict(list) # parent_taxid -> [children]
with open("nodes.dmp") as inf:
for line in inf:
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
"\t|\t"
)
child_taxid = int(child_taxid)
parent_taxid = int(parent_taxid)
if child_taxid != parent_taxid:
Expand Down
4 changes: 3 additions & 1 deletion dashboard/determine_key_clades.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
children = defaultdict(set) # parent_taxid -> [children]
with open("nodes.dmp") as inf:
for line in inf:
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
"\t|\t"
)
child_taxid = int(child_taxid)
parent_taxid = int(parent_taxid)
if child_taxid != parent_taxid:
Expand Down
37 changes: 28 additions & 9 deletions dashboard/prepare-dashboard-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@
parents = {} # child_taxid -> parent_taxid
with open("%s/nodes.dmp" % DASHBOARD_DIR) as inf:
for line in inf:
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
"\t|\t"
)
child_taxid = int(child_taxid)
parent_taxid = int(parent_taxid)
parents[child_taxid] = parent_taxid
Expand All @@ -51,7 +53,9 @@

# project -> sample -> n_reads
project_sample_reads = defaultdict(dict)
for metadata_fname in glob.glob("%s/bioprojects/*/metadata/metadata.tsv" % ROOT_DIR):
for metadata_fname in glob.glob(
"%s/bioprojects/*/metadata/metadata.tsv" % ROOT_DIR
):
project = metadata_fname.split("/")[-3]
if project in ["PRJEB30546", "PRJNA691135"]:
# didn't finish importing this one, and the dashboard chokes on papers
Expand Down Expand Up @@ -137,7 +141,9 @@
# paper -> {link, samples, projects, na_type, subset}
papers = {}
for project in projects:
with open("%s/bioprojects/%s/metadata/name.txt" % (ROOT_DIR, project)) as inf:
with open(
"%s/bioprojects/%s/metadata/name.txt" % (ROOT_DIR, project)
) as inf:
paper_name = inf.read().strip()
if paper_name not in papers:
papers[paper_name] = {}
Expand Down Expand Up @@ -169,7 +175,8 @@

def rc(s):
return "".join(
{"T": "A", "G": "C", "A": "T", "C": "G", "N": "N"}[x] for x in reversed(s)
{"T": "A", "G": "C", "A": "T", "C": "G", "N": "N"}[x]
for x in reversed(s)
)


Expand Down Expand Up @@ -289,7 +296,9 @@ def count_dups(hvr_fname):
taxonomic_names = defaultdict(list)
with open("%s/names.dmp" % DASHBOARD_DIR) as inf:
for line in inf:
taxid, name, unique_name, name_class = line.replace("\t|\n", "").split("\t|\t")
taxid, name, unique_name, name_class = line.replace("\t|\n", "").split(
"\t|\t"
)
taxid = int(taxid)

if taxid in mentioned_taxids or taxid in comparison_sample_counts:
Expand All @@ -305,19 +314,26 @@ def count_dups(hvr_fname):
sample_metadata = defaultdict(dict)

for project in projects:
with open("%s/bioprojects/%s/metadata/metadata.tsv" % (ROOT_DIR, project)) as inf:
with open(
"%s/bioprojects/%s/metadata/metadata.tsv" % (ROOT_DIR, project)
) as inf:
for line in inf:
if not line.strip():
continue
line = line[:-1] # drop trailing newline

sample, sample_metadata_dict = sample_metadata_classifier.interpret(
(
sample,
sample_metadata_dict,
) = sample_metadata_classifier.interpret(
project, papers, line.split("\t")
)
sample_metadata[sample] = sample_metadata_dict

for sample in project_sample_reads[project]:
sample_metadata[sample]["reads"] = project_sample_reads[project][sample]
sample_metadata[sample]["reads"] = project_sample_reads[project][
sample
]

rf_fname = "ribofrac/%s.ribofrac.txt" % sample
try:
Expand Down Expand Up @@ -349,7 +365,10 @@ def count_dups(hvr_fname):
]:
with open(DASHBOARD_DIR + name + ".json", "w") as outf:
json.dump(
val, outf, sort_keys=True, indent=None if val is human_virus_tree else 2
val,
outf,
sort_keys=True,
indent=None if val is human_virus_tree else 2,
)

# To make the dashboard load faster, divide counts by bioproject and don't load
Expand Down
26 changes: 21 additions & 5 deletions dashboard/sample_metadata_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,18 @@ def interpret(project, papers, bits):
elif project in papers["Bengtsson-Palme 2016"]["projects"]:
sample, location, site = bits
return sample, dict(
date="2012-09", country="Sweden", location=location, fine_location=site
date="2012-09",
country="Sweden",
location=location,
fine_location=site,
)
elif project in papers["Brinch 2020"]["projects"]:
sample, loc, date = bits
return sample, dict(
date=date, country="Denmark", location="Copenhagen", fine_location=loc
date=date,
country="Denmark",
location="Copenhagen",
fine_location=loc,
)
elif project in papers["Spurbeck 2023"]["projects"]:
sample, loc, date = bits
Expand Down Expand Up @@ -158,7 +164,10 @@ def interpret(project, papers, bits):
elif project in papers["Hendriksen 2019"]["projects"]:
sample, date, cluster = bits
return sample, dict(
country="Kenya", location="Kibera", fine_location=cluster, date=date
country="Kenya",
location="Kibera",
fine_location=cluster,
date=date,
)
elif project in papers["Yang 2020"]["projects"]:
sample, city = bits
Expand All @@ -168,7 +177,10 @@ def interpret(project, papers, bits):
elif project in papers["Wang 2022"]["projects"]:
sample, date, hospital = bits
return sample, dict(
country="Saudi Arabia", location="Jeddah", date=date, fine_location=hospital
country="Saudi Arabia",
location="Jeddah",
date=date,
fine_location=hospital,
)
elif project in papers["Cui 2023"]["projects"]:
(sample,) = bits
Expand Down Expand Up @@ -259,7 +271,11 @@ def interpret(project, papers, bits):
sample, _, enrichment, loc, city_state, date, flow = bits
city, state = city_state.split(", ")
record = dict(
country="United States", city=city, state="Texas", location=loc, date=date
country="United States",
city=city,
state="Texas",
location=loc,
date=date,
)
if enrichment == "1":
record["enrichment"] = "panel"
Expand Down
8 changes: 6 additions & 2 deletions expand-human-viruses.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
children = {}
with open("dashboard/nodes.dmp") as inf:
for line in inf:
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
"\t|\t"
)
child_taxid = int(child_taxid)
parent_taxid = int(parent_taxid)

Expand All @@ -42,7 +44,9 @@ def add_children(taxid):
taxonomic_names = {}
with open("dashboard/names.dmp") as inf:
for line in inf:
taxid, name, unique_name, name_class = line.replace("\t|\n", "").split("\t|\t")
taxid, name, unique_name, name_class = line.replace("\t|\n", "").split(
"\t|\t"
)
taxid = int(taxid)

if taxid in hv:
Expand Down
8 changes: 6 additions & 2 deletions papers/Brinch2020/prepare_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,12 @@
sample_to_details[bits[7]] = bits[15], bits[17]

for project in ["PRJEB34633", "PRJEB13832"]:
with open("../../bioprojects/%s/metadata/metadata_raw.tsv" % project) as inf:
with open("../../bioprojects/%s/metadata/metadata.tsv" % project, "w") as outf:
with open(
"../../bioprojects/%s/metadata/metadata_raw.tsv" % project
) as inf:
with open(
"../../bioprojects/%s/metadata/metadata.tsv" % project, "w"
) as outf:
for line in inf:
sample = line.strip()
if sample not in sample_to_details:
Expand Down
4 changes: 3 additions & 1 deletion papers/Munk2022/prepare-metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,9 @@ def clean_date(x):

bioproject_dir = os.path.join(root, "bioprojects")
for bioproject in os.listdir(bioproject_dir):
with open(os.path.join(bioproject_dir, bioproject, "metadata", "name.txt")) as inf:
with open(
os.path.join(bioproject_dir, bioproject, "metadata", "name.txt")
) as inf:
if inf.read().strip() != "Munk 2022":
continue

Expand Down
4 changes: 3 additions & 1 deletion pipeline-operation/screen-summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ def start():
with tempfile.TemporaryDirectory() as workdir:
tmpfname = os.path.join(workdir, "tmp.txt")

subprocess.check_call(["screen", "-S", screen, "-X", "hardcopy", tmpfname])
subprocess.check_call(
["screen", "-S", screen, "-X", "hardcopy", tmpfname]
)

# wait for screen to dump like we asked
while not os.path.exists(tmpfname):
Expand Down
7 changes: 5 additions & 2 deletions reprocess-bioprojects.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@
restricted_bioprojects = []
restricted_dir = os.path.join("..", "mgs-restricted")
if os.path.exists(restricted_dir):
restricted_bioprojects = os.listdir(os.path.join(restricted_dir, "bioprojects"))
restricted_bioprojects = os.listdir(
os.path.join(restricted_dir, "bioprojects")
)


def prepare_job(bioproject, log_prefix, run_args):
Expand Down Expand Up @@ -85,7 +87,8 @@ def start():
help="Log prefix, for storing this run under log/",
)
parser.add_argument(
"--bioprojects", help="The IDs of the bioproject to process, comma separated"
"--bioprojects",
help="The IDs of the bioproject to process, comma separated",
)
args = parser.parse_args(our_args)

Expand Down
Loading

0 comments on commit 96721a6

Please sign in to comment.