72 changes: 45 additions & 27 deletions Snakefile
@@ -136,12 +136,12 @@ def get_summaries():
hmm = ["TIGRFAMs","HAMAP","dbCAN"]
taxonomy = ["phylum", "class", "order"]
# code to generate the possible files
file_paths = expand("summaries/combined/{hmm}_{taxonomy}.txt",
hmm=hmm, taxonomy=taxonomy)
file_paths = expand("summaries/combined/{hmm}_{tax_classification}.txt",
hmm=hmm, tax_classification=taxonomy)
file_paths.extend(expand("summaries/hmms_summary/{hmm}_summary.txt",
hmm=hmm))
file_paths.extend(expand("summaries/taxonomy/{taxonomy}.txt",
taxonomy=taxonomy))
file_paths.extend(expand("summaries/taxonomy/{tax_classification}.txt",
tax_classification=taxonomy))
return file_paths
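# a minimal sketch of what expand() yields above (standard Snakemake
# cartesian-product behavior, values shortened for illustration):
#   expand("summaries/combined/{hmm}_{tax_classification}.txt",
#          hmm=["TIGRFAMs"], tax_classification=["phylum", "class"])
#   -> ["summaries/combined/TIGRFAMs_phylum.txt",
#       "summaries/combined/TIGRFAMs_class.txt"]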


@@ -394,7 +394,7 @@ rule aggregate_all_genes:
with open(f) as fh:
for name, seq, _ in readfx(fh):
# per sequence, choose first only
if name.endswith("_2"):
if not name.endswith("_1"):
continue
print(">%d" % name_index, seq.replace("*", ""), sep="\n", file=out_faa)
name_index += 1
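# sketch of the filter above, assuming gene names end in an ORF index
# (hypothetical ids): "k141_203_1" is kept, "k141_203_2" is skipped, and each
# kept record is renumbered ">0", ">1", ... with stop-codon "*" stripped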
@@ -481,7 +481,7 @@ rule split_fasta:
input:
fasta = "gene_catalog/clustered_genes.faa"
output:
temp((dynamic("gene_catalog/tmp/clustered_genes_{chunk}.faa")))
dynamic("gene_catalog/tmp/clustered_genes_{chunk}.faa")
params:
# consider a smaller chunk size
chunk_size = config.get("chunk_size", 1000000)
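# note: dynamic() is deprecated in favor of checkpoints in newer Snakemake
# releases; this assumes the version pinned for this workflow still supports
# it. without temp(), the per-chunk files now persist after the run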
@@ -621,9 +621,23 @@ rule combine_sample_output:
"""


rule remove_empty_annotations:
input:
annotations = "gene_catalog/annotations.txt"
output:
cleaned = "gene_catalog/annotations_cleaned.txt"
run:
with open(input.annotations) as in_fh, open(output.cleaned, "w") as out_fh:
header = next(in_fh)
print(header.strip("\n"), file=out_fh)
for line in in_fh:
toks = line.strip("\r\n").split("\t")
if not all(s == "" for s in toks[2:12]):
print(line.strip("\n"), file=out_fh)
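# sketch of the emptiness test with a hypothetical row: for
#   gene_1<TAB>900<TAB><TAB>...<TAB>   (annotation fields 2..11 all empty)
# all(s == "" for s in toks[2:12]) is True and the line is dropped; any single
# non-empty HMM or taxonomy field keeps it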

rule build_hmms_table:
input:
"gene_catalog/annotations.txt"
"gene_catalog/annotations_cleaned.txt"
output:
"summaries/hmms_summary/{hmm}_summary.txt"
params:
@@ -645,7 +659,7 @@ rule build_hmms_table:

rule build_tax_table:
input:
"gene_catalog/annotations.txt"
"gene_catalog/annotations_cleaned.txt"
output:
"summaries/taxonomy/{tax_classification}.txt"
params:
@@ -659,15 +673,15 @@ rule build_tax_table:
shell:
"""
python scripts/summarize_classifications.py \
--group-on kaiju_classification --min-evalue {params.min_evalue} \
--group-on kaiju_taxonomy --min-evalue {params.min_evalue} \
--min-score {params.min_score} --min-len {params.min_len} \
--tax-level {wildcards.tax_classification} {output} {input}
"""


rule build_hmm_and_tax_table:
input:
"gene_catalog/annotations.txt"
"gene_catalog/annotations_cleaned.txt"
output:
"summaries/combined/{hmm}_{tax_classification}.txt"
params:
@@ -679,12 +693,12 @@ rule build_hmm_and_tax_table:
shell:
"""
python scripts/summarize_classifications.py \
--group-on {wildcards.hmm} kaiju_classification \
--group-on {wildcards.hmm} kaiju_taxonomy \
--tax-level {wildcards.tax_classification} --min-evalue {params.min_evalue} \
--min-score {params.min_score}--min-len {params.min_len} {output} {input}
--min-score {params.min_score} --min-len {params.min_len} {output} {input}
"""

## TODO fix this rule

rule build_krona_ec_input:
input:
ec_file = "summaries/hmms_summary/TIGRFAMs_summary.txt",
@@ -731,30 +745,34 @@ rule build_krona_plots:
"""


# rule zip_attachments:
# input:
# function = "summaries/function/ec.txt",
# taxonomy = "summaries/taxonomy/order.txt",
# krona_tax = "krona_plots/tax.krona.html",
# krona_ec = "krona_plots/ec.krona.html"
# output:
# temp("perseq_downloads.zip")
# shell:
# """
# zip {output} {input.function} {input.taxonomy} {input.combined}
# """
rule zip_attachments:
input:
function = "summaries/hmms_summary/TIGRFAMs_summary.txt",
taxonomy = "summaries/taxonomy/order.txt",
combined = "summaries/combined/TIGRFAMs_order.txt",
krona_tax = "krona_plots/tax.krona.html",
krona_ec = "krona_plots/ec.krona.html"
output:
"perseq_downloads.tar.gz"
conda:
CONDAENV
shell:
"""
tar -czf {output} {input.function} {input.taxonomy} {input.combined} \
{input.krona_tax} {input.krona_ec}
"""


rule build_report:
input:
annotations = "gene_catalog/annotations.txt",
annotations = "gene_catalog/annotations_cleaned.txt",
ee_stats = expand("logs/{sample}_{idx}_eestats.txt", sample=config["samples"].keys(), idx=["R1", "R2"]),
clean_length_logs = expand("logs/{sample}_03_clean_readlengths.txt", sample=config["samples"].keys()),
unique_length_logs = expand("logs/{sample}_02_unique_readlengths.txt", sample=config["samples"].keys()),
clean_logs = expand("logs/{sample}_decontamination.log", sample=config["samples"].keys()),
merge_logs = expand("logs/{sample}_merge_sequences.log", sample=config["samples"].keys()),
taxonomy = "summaries/taxonomy/order.txt",
zipped_files = "perseq_downloads.zip"
zipped_files = "perseq_downloads.tar.gz"
output:
"summary.html"
conda:
2 changes: 1 addition & 1 deletion envs/environment.yml
@@ -19,5 +19,5 @@ dependencies:
- seqtk=1.3
- pip
- pip:
- relatively
- relatively>=1.0.5
- plotly>=3.0
127 changes: 80 additions & 47 deletions scripts/build_krona.py
@@ -17,11 +17,10 @@ def build_tax(tax_file, output):
# read in tax file to order level from build_tax_table rule
test_tax = pd.read_table(tax_file)
col_names = list(test_tax)[1:]
taxonomies = ["Kingdom", "Phylum", "Class", "Order"]
# split taxonomy column into hierarchy
test_tax[
["Kingdom", "Phylum", "Class", "Order"]
] = test_tax.taxonomy_order.str.split("; ", expand=True)
] = test_tax.taxonomy_order.str.split("; ", n=4, expand=True)
# generate individual files per sample
for item in col_names:
df_name = f"test_tax_{item}"
@@ -85,72 +84,106 @@ def build_ec_dict(ec_filename, dat_filename):
return d


# def parse_ec_file(ec_file_from_summaries):
# """
# Split sequences that aligned to multiple ec numbers into last common ec number
# shared amongst matches
# """
# with gzopen(ec_file_from_summaries) as ec_file_sum:
#
# next(ec_file_sum)
# new_ec_dict = {"ec": []}
# # split ec numbers that mapped to multiple
# for item in ec_file_sum:
# toks = item.partition("\t")[0]
# ec = toks.split(";")
# # print(ec)
# first_item = ec[0][:5]
# if len(ec) == 1:
# new_ec_dict["ec"].append(ec[0])
# else:
# n = 0
# for item in ec[1:]:
# if item[:5] == first_item:
# new_ec_dict["ec"].append(item[:5] + ".-")
# # print('same',item,first_item)
# elif item[:3] == first_item[:3]:
# new_ec_dict["ec"].append(item[:3] + ".-.-")
# # print('same to high level',item,first_item)
# elif item[:1] == first_item[:1]:
# new_ec_dict["ec"].append(item[:1] + ".-.-.-")
# return new_ec_dict

def parse_ec_file(ec_file_from_summaries):
"""
Split sequences that aligned to multiple ec numbers into last common ec number
shared amongst matches
For EC numbers that aligned to multiple ECs in TIGRFAMs, only the first
entry is retained for Krona plots.
"""
with gzopen(ec_file_from_summaries) as ec_file_sum:

next(ec_file_sum)
new_ec_dict = {"ec": []}
# split ec numbers that mapped to multiple
for item in ec_file_sum:
toks = item.partition("\t")[0]
ec = toks.split(";")
# print(ec)
first_item = ec[0][:5]
if len(ec) == 1:
new_ec_dict["ec"].append(ec[0])
else:
n = 0
for item in ec[1:]:
if item[:5] == first_item:
new_ec_dict["ec"].append(item[:5] + ".-")
# print('same',item,first_item)
elif item[:3] == first_item[:3]:
new_ec_dict["ec"].append(item[:3] + ".-.-")
# print('same to high level',item,first_item)
elif item[:1] == first_item[:1]:
new_ec_dict["ec"].append(item[:1] + ".-.-.-")
return new_ec_dict
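# worked example of the prefix comparison above (hypothetical ECs, assuming
# single-digit fields so the [:5]/[:3]/[:1] character slices line up):
#   "1.2.3.4"          -> "1.2.3.4"  (single match, kept verbatim)
#   "1.2.3.4;1.2.3.9"  -> "1.2.3.-"  (first five characters agree)
#   "1.2.3.4;1.2.9.9"  -> "1.2.-.-"  (agreement only through [:3])
#   "1.2.3.4;1.9.9.9"  -> "1.-.-.-"  (agreement only on the first digit)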


def build_ec_output(new_ec_dict, ec_file_from_summaries, parsed_dict, output):
ec_df = pd.read_table(ec_file_from_summaries)
samples = ec_df.filter(regex="[0-9]+").columns.values.tolist()
ec_df = ec_df[["tigrfams_ec"]+samples]
ec_df = ec_df.groupby(["tigrfams_ec"]).sum().reset_index()
ec_df[["tigrfams_ec"]] = ec_df.tigrfams_ec.str.split(",", n=1).str[0]
return ec_df
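# e.g. a hypothetical multi-EC annotation "1.2.3.4,1.5.6.7" collapses to
# "1.2.3.4" here, matching the docstring: only the first entry is kept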


# def build_ec_output(new_ec_dict, ec_file_from_summaries, parsed_dict, output):
# """
# Splits the ec number counts and hierarchy by sample
# """
# #ec_replace = pd.DataFrame.from_dict(new_ec_dict)
# ec_table = pd.read_table(ec_file_from_summaries)
# col_names = list(ec_table)[1:]
# #ec_table["ec"] = ec_replace["ec"]
# ec_hier_dict = pd.DataFrame.from_dict(parsed_dict, orient="index").reset_index()
# ec_hier_dict = ec_hier_dict.rename(columns={"index": "ec"})
# # print(ec_hier_dict.head(5))
# grouped_ec_tbl = ec_table.merge(ec_hier_dict, on="ec", how="inner")
# # print(grouped_ec_tbl.head(5))
# grouped_ec_tbl = grouped_ec_tbl.rename(
# columns={0: "level_1", 1: "level_2", 2: "level_3", 3: "level_4"}
# )
# # split the samples into separate files
# for item in col_names:
# df_name = f"test_tax_{item}"
# df_name = grouped_ec_tbl[
# [f"{item}", "level_1", "level_2", "level_3", "level_4"]
# ]
#
# df_name.to_csv(
# output + "/" + f"{item}" + "_ec.txt", sep="\t", index=False, header=False
# )
def build_ec_output(ec_df, parsed_dict, output):
"""
Splits the ec number counts and hierarchy by sample
"""
ec_replace = pd.DataFrame.from_dict(new_ec_dict)
ec_table = pd.read_table(ec_file_from_summaries)
col_names = list(ec_table)[1:]
ec_table["ec"] = ec_replace["ec"]
ec_table = ec_df
samples = list(ec_table)[1:]
ec_hier_dict = pd.DataFrame.from_dict(parsed_dict, orient="index").reset_index()
ec_hier_dict = ec_hier_dict.rename(columns={"index": "ec"})
# print(ec_hier_dict.head(5))
grouped_ec_tbl = ec_table.merge(ec_hier_dict, on="ec", how="inner")
# print(grouped_ec_tbl.head(5))
ec_hier_dict = ec_hier_dict.rename(columns={"index": "tigrfams_ec"})
grouped_ec_tbl = ec_table.merge(ec_hier_dict, on="tigrfams_ec", how="inner")
grouped_ec_tbl = grouped_ec_tbl.rename(
columns={0: "level_1", 1: "level_2", 2: "level_3", 3: "level_4"}
)
# split the samples into separate files
for item in col_names:
df_name = f"test_tax_{item}"
for sample in samples:
#df_name = f"test_tax_{sample}"
df_name = grouped_ec_tbl[
[f"{item}", "level_1", "level_2", "level_3", "level_4"]
[f"{sample}", "level_1", "level_2", "level_3", "level_4"]
]

print(df_name.head())
df_name.to_csv(
output + "/" + f"{item}" + "_ec.txt", sep="\t", index=False, header=False
output + "/" + f"{sample}" + "_ec.txt", sep="\t", index=False, header=False
)
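# each per-sample file written above is a headerless <count><TAB>level_1..level_4
# table, the layout Krona's ktImportText expects (presumably what the
# build_krona_plots rule feeds it)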


def main(tax_file, output, ec_file, ec_file_from_summaries, dat_file):
if tax_file is not None:
build_tax(tax_file, output)
else:
new_ec_dict = parse_ec_file(ec_file_from_summaries)
ec_df = parse_ec_file(ec_file_from_summaries)
parsed_dict = build_ec_dict(ec_file, dat_file)
build_ec_output(new_ec_dict, ec_file_from_summaries, parsed_dict, output)
build_ec_output(ec_df, parsed_dict, output)


if __name__ == "__main__":
@@ -167,7 +200,7 @@ def main(tax_file, output, ec_file, ec_file_from_summaries, dat_file):
p.add_argument(
"--ec-file-from-summaries",
type=str,
help="file that contains the ec numbers from the diamond alignment",
help="file that contains the ec numbers from TIGRFAMs summary",
)
p.add_argument(
"--dat-file",