From 4945d8c9b56dcc3aedcc027d9e3647f1e836cb02 Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 27 Nov 2018 12:49:54 -0800 Subject: [PATCH 01/14] fixed ec krona building --- Snakefile | 32 ++++++----- scripts/build_krona.py | 127 ++++++++++++++++++++++++++--------------- 2 files changed, 97 insertions(+), 62 deletions(-) diff --git a/Snakefile b/Snakefile index 463699e..cedabe9 100644 --- a/Snakefile +++ b/Snakefile @@ -394,7 +394,7 @@ rule aggregate_all_genes: with open(f) as fh: for name, seq, _ in readfx(fh): # per sequence, choose first only - if name.endswith("_2"): + if not name.endswith("_1"): continue print(">%d" % name_index, seq.replace("*", ""), sep="\n", file=out_faa) name_index += 1 @@ -481,7 +481,7 @@ rule split_fasta: input: fasta = "gene_catalog/clustered_genes.faa" output: - temp((dynamic("gene_catalog/tmp/clustered_genes_{chunk}.faa"))) + dynamic("gene_catalog/tmp/clustered_genes_{chunk}.faa") params: # consider a smaller chunk size chunk_size = config.get("chunk_size", 1000000) @@ -684,7 +684,7 @@ rule build_hmm_and_tax_table: --min-score {params.min_score}--min-len {params.min_len} {output} {input} """ -## TODO fix this rule + rule build_krona_ec_input: input: ec_file = "summaries/hmms_summary/TIGRFAMs_summary.txt", @@ -731,18 +731,20 @@ rule build_krona_plots: """ -# rule zip_attachments: -# input: -# function = "summaries/function/ec.txt", -# taxonomy = "summaries/taxonomy/order.txt", -# krona_tax = "krona_plots/tax.krona.html", -# krona_ec = "krona_plots/ec.krona.html" -# output: -# temp("perseq_downloads.zip") -# shell: -# """ -# zip {output} {input.function} {input.taxonomy} {input.combined} -# """ +rule zip_attachments: + input: + function = "summaries/hmms_summary/TIGRFAMs_summary.txt", + taxonomy = "summaries/taxonomy/order.txt", + combined = "summaries/combined/TIGRFAMs_order.txt", + krona_tax = "krona_plots/tax.krona.html", + krona_ec = "krona_plots/ec.krona.html" + output: + temp("perseq_downloads.zip") + shell: + """ + zip {output} {input.function} {input.taxonomy} {input.combined} \ + {input.krona_tax} {input.krona_ec} + """ rule build_report: diff --git a/scripts/build_krona.py b/scripts/build_krona.py index 6222eed..0fc5642 100755 --- a/scripts/build_krona.py +++ b/scripts/build_krona.py @@ -17,11 +17,10 @@ def build_tax(tax_file, output): # read in tax file to order level from build_tax_table rule test_tax = pd.read_table(tax_file) col_names = list(test_tax)[1:] - taxonomies = ["Kingdom", "Phylum", "Class", "Order"] # split taxonomy column into hierarchy test_tax[ ["Kingdom", "Phylum", "Class", "Order"] - ] = test_tax.taxonomy_order.str.split("; ", expand=True) + ] = test_tax.taxonomy_order.str.split("; ",n=4, expand=True) # generate individual files per sample for item in col_names: df_name = f"test_tax_{item}" @@ -85,62 +84,96 @@ def build_ec_dict(ec_filename, dat_filename): return d +# def parse_ec_file(ec_file_from_summaries): +# """ +# Split sequences that aligned to multiple ec numbers into last common ec number +# shared amongst matches +# """ +# with gzopen(ec_file_from_summaries) as ec_file_sum: +# +# next(ec_file_sum) +# new_ec_dict = {"ec": []} +# # split ec numbers that mapped to multiple +# for item in ec_file_sum: +# toks = item.partition("\t")[0] +# ec = toks.split(";") +# # print(ec) +# first_item = ec[0][:5] +# if len(ec) == 1: +# new_ec_dict["ec"].append(ec[0]) +# else: +# n = 0 +# for item in ec[1:]: +# if item[:5] == first_item: +# new_ec_dict["ec"].append(item[:5] + ".-") +# # print('same',item,first_item) +# elif item[:3] == 
first_item[:3]:
+#                        new_ec_dict["ec"].append(item[:3] + ".-.-")
+#                        # print('same to high level',item,first_item)
+#                    elif item[:1] == first_item[:1]:
+#                        new_ec_dict["ec"].append(item[:1] + ".-.-.-")
+#    return new_ec_dict
+
 def parse_ec_file(ec_file_from_summaries):
     """
-    Split sequences that aligned to multiple ec numbers into last common ec number
-    shared amongst matches
+    For EC numbers that aligned to multiple ECs in TIGRFAMs, only the first
+    entry is retained for the Krona plots.
     """
-    with gzopen(ec_file_from_summaries) as ec_file_sum:
-
-        next(ec_file_sum)
-        new_ec_dict = {"ec": []}
-        # split ec numbers that mapped to multiple
-        for item in ec_file_sum:
-            toks = item.partition("\t")[0]
-            ec = toks.split(";")
-            # print(ec)
-            first_item = ec[0][:5]
-            if len(ec) == 1:
-                new_ec_dict["ec"].append(ec[0])
-            else:
-                n = 0
-                for item in ec[1:]:
-                    if item[:5] == first_item:
-                        new_ec_dict["ec"].append(item[:5] + ".-")
-                        # print('same',item,first_item)
-                    elif item[:3] == first_item[:3]:
-                        new_ec_dict["ec"].append(item[:3] + ".-.-")
-                        # print('same to high level',item,first_item)
-                    elif item[:1] == first_item[:1]:
-                        new_ec_dict["ec"].append(item[:1] + ".-.-.-")
-    return new_ec_dict
-
-
-def build_ec_output(new_ec_dict, ec_file_from_summaries, parsed_dict, output):
+    ec_df = pd.read_table(ec_file_from_summaries)
+    samples = ec_df.filter(regex="[0-9]+").columns.values.tolist()
+    ec_df = ec_df[["tigrfams_ec"] + samples]
+    ec_df = ec_df.groupby(["tigrfams_ec"]).sum(axis=1).reset_index()
+    ec_df[["tigrfams_ec"]] = ec_df.tigrfams_ec.str.split(",", n=1).str[0]
+    return ec_df
+
+
+# def build_ec_output(new_ec_dict, ec_file_from_summaries, parsed_dict, output):
+#     """
+#     Splits the ec number counts and hierarchy by sample
+#     """
+#     #ec_replace = pd.DataFrame.from_dict(new_ec_dict)
+#     ec_table = pd.read_table(ec_file_from_summaries)
+#     col_names = list(ec_table)[1:]
+#     #ec_table["ec"] = ec_replace["ec"]
+#     ec_hier_dict = pd.DataFrame.from_dict(parsed_dict, orient="index").reset_index()
+#     ec_hier_dict = ec_hier_dict.rename(columns={"index": "ec"})
+#     # print(ec_hier_dict.head(5))
+#     grouped_ec_tbl = ec_table.merge(ec_hier_dict, on="ec", how="inner")
+#     # print(grouped_ec_tbl.head(5))
+#     grouped_ec_tbl = grouped_ec_tbl.rename(
+#         columns={0: "level_1", 1: "level_2", 2: "level_3", 3: "level_4"}
+#     )
+#     # split the samples into separate files
+#     for item in col_names:
+#         df_name = f"test_tax_{item}"
+#         df_name = grouped_ec_tbl[
+#             [f"{item}", "level_1", "level_2", "level_3", "level_4"]
+#         ]
+#
+#         df_name.to_csv(
+#             output + "/" + f"{item}" + "_ec.txt", sep="\t", index=False, header=False
+#         )
+def build_ec_output(ec_df, parsed_dict, output):
     """
     Splits the ec number counts and hierarchy by sample
     """
-    ec_replace = pd.DataFrame.from_dict(new_ec_dict)
-    ec_table = pd.read_table(ec_file_from_summaries)
-    col_names = list(ec_table)[1:]
-    ec_table["ec"] = ec_replace["ec"]
+    ec_table = ec_df
+    samples = list(ec_table)[1:]
     ec_hier_dict = pd.DataFrame.from_dict(parsed_dict, orient="index").reset_index()
-    ec_hier_dict = ec_hier_dict.rename(columns={"index": "ec"})
-    # print(ec_hier_dict.head(5))
-    grouped_ec_tbl = ec_table.merge(ec_hier_dict, on="ec", how="inner")
-    # print(grouped_ec_tbl.head(5))
+    ec_hier_dict = ec_hier_dict.rename(columns={"index": "tigrfams_ec"})
+    grouped_ec_tbl = ec_table.merge(ec_hier_dict, on="tigrfams_ec", how="inner")
     grouped_ec_tbl = grouped_ec_tbl.rename(
         columns={0: "level_1", 1: "level_2", 2: "level_3", 3: "level_4"}
     )
     # split the samples into separate files
-    for item
in col_names: - df_name = f"test_tax_{item}" + for sample in samples: + #df_name = f"test_tax_{sample}" df_name = grouped_ec_tbl[ - [f"{item}", "level_1", "level_2", "level_3", "level_4"] + [f"{sample}", "level_1", "level_2", "level_3", "level_4"] ] - + print(df_name.head()) df_name.to_csv( - output + "/" + f"{item}" + "_ec.txt", sep="\t", index=False, header=False + output + "/" + f"{sample}" + "_ec.txt", sep="\t", index=False, header=False ) @@ -148,9 +181,9 @@ def main(tax_file, output, ec_file, ec_file_from_summaries, dat_file): if tax_file is not None: build_tax(tax_file, output) else: - new_ec_dict = parse_ec_file(ec_file_from_summaries) + ec_df = parse_ec_file(ec_file_from_summaries) parsed_dict = build_ec_dict(ec_file, dat_file) - build_ec_output(new_ec_dict, ec_file_from_summaries, parsed_dict, output) + build_ec_output(ec_df, parsed_dict, output) if __name__ == "__main__": @@ -167,7 +200,7 @@ def main(tax_file, output, ec_file, ec_file_from_summaries, dat_file): p.add_argument( "--ec-file-from-summaries", type=str, - help="file that contains the ec numbers from the diamond alignment", + help="file that contains the ec numbers from TIGRFAMs summary", ) p.add_argument( "--dat-file", From bc175b739acbde095ab041e043fbbe7aa9ea60ce Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 27 Nov 2018 12:50:44 -0800 Subject: [PATCH 02/14] altered summary scripts --- scripts/make_classification_table.py | 8 +- scripts/summarize_classifications.py | 137 +++++++++++++++------------ 2 files changed, 78 insertions(+), 67 deletions(-) mode change 100644 => 100755 scripts/make_classification_table.py diff --git a/scripts/make_classification_table.py b/scripts/make_classification_table.py old mode 100644 new mode 100755 index 6214129..ac99a76 --- a/scripts/make_classification_table.py +++ b/scripts/make_classification_table.py @@ -48,11 +48,11 @@ def main(kaiju, hamap, dbcan, tigrfams, diamond, output): sequence_counts = parse_diamond_outputs(diamond) # ECs may be a comma delimited list - # sequence_id -> [ec, gene, product, hmm_id, score, evalue] + # sequence_id -> [ec, gene, product, hmm_id, evalue, score] hamap_hits = parse_hmm_hits(hamap) - # sequence_id -> [ec, enzyme class, enzyme class subfamily, hmm_id, score, evalue] + # sequence_id -> [ec, enzyme class, enzyme class subfamily, hmm_id, evalue, score] dbcan_hits = parse_hmm_hits(dbcan) - # sequence_id -> [ec, gene, product, hmm_id, score, evalue] + # sequence_id -> [ec, gene, product, hmm_id, evalue, score] tigrfams_hits = parse_hmm_hits(tigrfams) output_header = [ @@ -108,7 +108,7 @@ def main(kaiju, hamap, dbcan, tigrfams, diamond, output): tigrfams_ec, tigrfams_gene, tigrfams_product, - tigerfams_score, + tigrfams_score, tigrfams_evalue, hamap_ec, hamap_gene, diff --git a/scripts/summarize_classifications.py b/scripts/summarize_classifications.py index 9e2afa0..eb0f89e 100755 --- a/scripts/summarize_classifications.py +++ b/scripts/summarize_classifications.py @@ -57,71 +57,82 @@ # logging.debug(f"Final table length: {len(df)}") # return df -def single_output(df,tax_level,samples,min_evlaue,min_score,min_len): +def single_output(df,tax_level,samples,min_evlaue,min_score,min_len,group_on,output): split_idx = TAX_LEVELS[tax_level] - group_one=["tigrfams","hamap","dbcan","kaiju"] - for item in group_one: - hmm_cols = df.filter(like=item).columns.values.tolist() - #TODO might not need - test_df = df[hmm_cols+samples].copy() - if "kaiju" in item: - test_df = test_df[(df["kaiju_length"] > min_len)] - test_df["kaiju_taxonomy"] = 
df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) - test_df = test_df.drop(columns=["kaiju_length"]).groupby(["kaiju_taxonomy"]).sum(axis=1).reset_index() - test_df = test_df.rename( - columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} - ) - else: - metric_cols = test_df.filter(items=[item+"_score",item+"_evalue"]).columns.values.tolist() - test_df = test_df[(test_df[item+"_score"] > min_score) & (test_df[item+"_evalue"] < min_evalue)] - test_df = test_df.drop(columns=metric_cols).groupby(hmm_cols[0:3]).sum(axis=1).reset_index().head() - test_df.to_csv(item+output, sep="\t", index=False) - - -def combined_output(df,tax_level,samples,min_evalue,min_score,min_len): - split_idx = TAX_LEVELS[tax_level] - group_one=["tigrfams","hamap","dbcan"] - kaiju_cols = df.filter(like="kaiju").columns.values.tolist() - for item in group_one: - hmm_cols = df.filter(like=item).columns.values.tolist() - #TODO might not need - test_df = df[kaiju_cols+hmm_cols+samples].copy() - test_df = test_df[(df["kaiju_length"] > min_len) & (test_df[item+"_score"] > min_score) & (test_df[item+"_evalue"] < min_evalue)] + # group_one=["tigrfams","hamap","dbcan","kaiju"] + # for item in group_one: + hmm_cols = df.filter(like=group_on[0]).columns.values.tolist() + #TODO might not need + test_df = df[hmm_cols+samples].copy() + if "kaiju" in item: + test_df = test_df[(df["kaiju_length"] > min_len)] test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) - metric_cols = test_df.filter(items=[item+"_score",item+"_evalue"]).columns.values.tolist() - metric_cols.append("kaiju_length") - test_df = test_df.drop(columns=metric_cols).groupby(["kaiju_taxonomy"]+hmm_cols[0:3]).sum(axis=1).reset_index() + test_df = test_df.drop(columns=["kaiju_length"]).groupby(["kaiju_taxonomy"]).sum(axis=1).reset_index() test_df = test_df.rename( columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} ) - test_df.to_csv("taxonomy_"+item, sep="\t", index=False) + else: + metric_cols = test_df.filter(items=[group_on[0]+"_score",group_on[0]+"_evalue"]).columns.values.tolist() + test_df = test_df[(test_df[group_on[0].lower()+"_score"] > min_score) & (test_df[group_on[0].lower()+"_evalue"] < min_evalue)] + test_df = test_df.drop(columns=metric_cols).groupby(hmm_cols[0:3]).sum(axis=1).reset_index().head() + test_df.to_csv(item+output, sep="\t", index=False) -def main(output, table, tax_level, min_evalue, min_score, min_len): +def combined_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,output): + split_idx = TAX_LEVELS[tax_level] + # group_one=["tigrfams","hamap","dbcan"] + kaiju_cols = df.filter(like="kaiju").columns.values.tolist() + # for item in group_one: + hmm_cols = df.filter(like=group_on[0].lower()).columns.values.tolist() + #TODO might not need + test_df = df[kaiju_cols+hmm_cols+samples].copy() + test_df = test_df[(df["kaiju_length"] > min_len) & (test_df[group_on[0]+"_score"] > min_score) & (test_df[group_on[0]+"_evalue"] < min_evalue)] + test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) + metric_cols = test_df.filter(items=[group_on[0].lower()+"_score",group_on[0].lower()+"_evalue"]).columns.values.tolist() + metric_cols.append("kaiju_length") + test_df = test_df.drop(columns=metric_cols).groupby(["kaiju_taxonomy"]+hmm_cols[0:3]).sum(axis=1).reset_index() + test_df = test_df.rename( + columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} + ) + test_df.to_csv(output, sep="\t", 
index=False) + + +def main(output, table, tax_level, min_evalue, min_score, min_len,group_on): df = pd.read_table(table) # remove the rows w/ no information - df = df.query("kaiju_length != 0") + df = df.dropna( + subset=[ + "kaiju_taxonomy", "tigrfams_ec", "tigrfams_gene", "tigrfams_product", + "tigrfams_score", "tigrfams_evalue", "hamap_ec", "hamap_gene", + "hamap_product", "hamap_score", "hamap_evalue", "dbcan_ec", + "dbcan_enzyme_class", "dbcan_enzyme_class_subfamily" + ], + thresh=1) samples = df.filter(regex="[0-9]+").columns.values.tolist() - split_idx = TAX_LEVELS[tax_level] - group_one=["tigrfams","hamap","dbcan","kaiju"] - for item in group_one: - print(item) - use_these = df.filter(like=item).columns.values.tolist() - print(use_these) - #TODO might not need - test_df = df[use_these+samples].copy() - if "kaiju" in item: - test_df = test_df[(df["kaiju_length"] > min_len)] - test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) - test_df = test_df.drop(columns=["kaiju_length"]).groupby(["kaiju_taxonomy"]).sum(axis=1).reset_index() - test_df = test_df.rename( - columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} - ) - else: - metric_cols = test_df.filter(items=[item+"_score",item+"_evalue"]).columns.values.tolist() - test_df = test_df[(test_df[item+"_score"] > min_score) & (test_df[item+"_evalue"] < min_evalue)] - test_df = test_df.drop(columns=metric_cols).groupby(use_these[0:3]).sum(axis=1).reset_index().head() - test_df.to_csv(item+output, sep="\t", index=False) + if len(group_on) > 1: + combined_output(df,tax_level,min_evalue, min_score, min_len, group_on,output) + else: + single_output(df,tax_level,samples,min_evlaue,min_score,min_len,group_on,output) + # split_idx = TAX_LEVELS[tax_level] + # group_one=["tigrfams","hamap","dbcan","kaiju"] + # for item in group_one: + # print(item) + # use_these = df.filter(like=item).columns.values.tolist() + # print(use_these) + # #TODO might not need + # test_df = df[use_these+samples].copy() + # if "kaiju" in item: + # test_df = test_df[(df["kaiju_length"] > min_len)] + # test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) + # test_df = test_df.drop(columns=["kaiju_length"]).groupby(["kaiju_taxonomy"]).sum(axis=1).reset_index() + # test_df = test_df.rename( + # columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} + # ) + # else: + # metric_cols = test_df.filter(items=[item+"_score",item+"_evalue"]).columns.values.tolist() + # test_df = test_df[(test_df[item+"_score"] > min_score) & (test_df[item+"_evalue"] < min_evalue)] + # test_df = test_df.drop(columns=metric_cols).groupby(use_these[0:3]).sum(axis=1).reset_index().head() + # test_df.to_csv(item+output, sep="\t", index=False) # tax_split_level = TAX_LEVELS[tax_level] # sample_df = None # logging.debug(f"Preparing to parse {len(tables)} tables") @@ -185,13 +196,13 @@ def main(output, table, tax_level, min_evalue, min_score, min_len): default=0.001, help="minimum kaiju alignment length to retain", ) - # parser.add_argument( - # "--group-on", - # # choices=["kaiju_classification", "ko", "ec", "product"], - # nargs="+", - # default="kaiju_taxonomy", - # help="cluster tables on taxonomy and/or function; headers must match in tables", - # ) + parser.add_argument( + "--group-on", + # choices=["kaiju_classification", "ko", "ec", "product"], + nargs="+", + default="kaiju_taxonomy", + help="cluster tables on taxonomy and/or function; headers must match in tables", + ) 
parser.add_argument( "--verbose", action="store_true", help="increase output verbosity" ) @@ -205,5 +216,5 @@ def main(output, table, tax_level, min_evalue, min_score, min_len): args.min_evalue, args.min_score, args.min_len, - # args.group_on, + args.group_on, ) From 69e6f0eb9208a3e883606afdef957d54db62fa02 Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 27 Nov 2018 16:22:59 -0800 Subject: [PATCH 03/14] fix error in combined output rule --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index cedabe9..a6a6f00 100644 --- a/Snakefile +++ b/Snakefile @@ -681,7 +681,7 @@ rule build_hmm_and_tax_table: python scripts/summarize_classifications.py \ --group-on {wildcards.hmm} kaiju_classification \ --tax-level {wildcards.tax_classification} --min-evalue {params.min_evalue} \ - --min-score {params.min_score}--min-len {params.min_len} {output} {input} + --min-score {params.min_score} --min-len {params.min_len} {output} {input} """ From 2e8b7821c10bcbd5dc8b648f77fcbf5251fb691f Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 27 Nov 2018 16:23:38 -0800 Subject: [PATCH 04/14] fixes report --- scripts/build_report.py | 82 +++++++++++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 20 deletions(-) diff --git a/scripts/build_report.py b/scripts/build_report.py index ace0eaa..e63bb73 100644 --- a/scripts/build_report.py +++ b/scripts/build_report.py @@ -1,4 +1,4 @@ -import argparse + import argparse import csv import os from collections import Counter, defaultdict @@ -45,16 +45,17 @@ def get_sample_order(lst): return [i[0] for i in sorted(lst, key=lambda x: x[1], reverse=True)] -def build_taxonomy_plot(txt, value_cols, height=900): +def build_taxonomy_plot(txt, height=900): df = pd.read_table(txt) + samples = df.filter(regex="[0-9]+").columns.values.tolist() levels = ["kingdom", "phylum", "class", "order"] df[levels] = df["taxonomy_order"].str.split(";", expand=True) hierarchy = ["phylum", "class", "order"] df[hierarchy] = df[hierarchy].fillna("NA") - df[value_cols] = df[value_cols].fillna(0) + df[samples] = df[samples].fillna(0) df = df[hierarchy + value_cols] dfs = relatively.get_dfs_across_hierarchy( - df, hierarchy, value_cols, reorder="shannon", dependent="; " + df, hierarchy, samples, reorder="shannon", dependent="; " ) fig = relatively.get_abundance_figure_from_dfs( dfs, hierarchy, "Assigned Taxonomy Per Sample", height=height @@ -66,11 +67,47 @@ def get_sample_name(path, key): return [os.path.basename(item).partition(key)[0] for item in path] +# def parse_classifications_for_taxonomy(path): +# """ +# SN1035:381:h3cv7bcx2:2:2202:3165:73750 -1 -1 +# SN1035:381:h3cv7bcx2:1:1105:7280:88523 67.5 40 111 Archaea; Thaumarchaeota; Nitrososphaerales; Nitrososphaeria; Nitrososphaeraceae; Candidatus Nitrosocosmicus; Candidatus Nitrocosmicus oleophilus; +# SN1035:381:h3cv7bcx2:1:1204:20140:83036 78.4 37 ko:K00820 glmS, GFPT; glucosamine---fructose-6-phosphate aminotransferase (isomerizing) 2.6.1.16 229 Bacteria; Actinobacteria; NA; NA; NA; NA; Actinobacteria bacterium 13_1_40CM_66_12; +# """ +# logger.info("Parsing {}".format(path)) +# # hardcoded tax levels :( +# taxonomy_level_counter = { +# "order": Counter(), +# "class": Counter(), +# "phylum": Counter(), +# } +# taxonomy_counter = Counter() +# summary_counter = Counter() +# with open(path) as fh: +# # skip the header +# next(fh) +# for i, line in enumerate(fh, start=1): +# toks = line.strip("\r\n").split("\t") +# if toks[3]: +# summary_counter.update(["Assigned Function"]) +# if 
toks[7]: +# summary_counter.update(["Assigned Both"]) +# if toks[7]: +# taxonomy_counter.update([toks[7]]) +# taxonomy = [j.strip() for j in toks[7].split(";")] +# taxonomy_level_counter["order"].update([taxonomy[3]]) +# taxonomy_level_counter["class"].update([taxonomy[2]]) +# taxonomy_level_counter["phylum"].update([taxonomy[1]]) +# summary_counter.update(["Assigned Taxonomy"]) +# return dict( +# taxonomy_level_counter=taxonomy_level_counter, +# summary_counter=summary_counter +# ) + def parse_classifications_for_taxonomy(path): """ - SN1035:381:h3cv7bcx2:2:2202:3165:73750 -1 -1 - SN1035:381:h3cv7bcx2:1:1105:7280:88523 67.5 40 111 Archaea; Thaumarchaeota; Nitrososphaerales; Nitrososphaeria; Nitrososphaeraceae; Candidatus Nitrosocosmicus; Candidatus Nitrocosmicus oleophilus; - SN1035:381:h3cv7bcx2:1:1204:20140:83036 78.4 37 ko:K00820 glmS, GFPT; glucosamine---fructose-6-phosphate aminotransferase (isomerizing) 2.6.1.16 229 Bacteria; Actinobacteria; NA; NA; NA; NA; Actinobacteria bacterium 13_1_40CM_66_12; + 37056 274 Bacteria; Bacteroidetes; Sphingobacteriia; Sphingobacteriales; NA; NA; Sphingobacteriales bacterium UTBCD1; 3.4.21.92 clpP ATP-dependent Clp endopeptidase proteolytic subun + 75331 0 0 0 0 0 0 0 0 0 + """ logger.info("Parsing {}".format(path)) # hardcoded tax levels :( @@ -86,32 +123,37 @@ def parse_classifications_for_taxonomy(path): next(fh) for i, line in enumerate(fh, start=1): toks = line.strip("\r\n").split("\t") - if toks[3]: - summary_counter.update(["Assigned Function"]) - if toks[7]: - summary_counter.update(["Assigned Both"]) - if toks[7]: + if toks[2]: taxonomy_counter.update([toks[7]]) taxonomy = [j.strip() for j in toks[7].split(";")] taxonomy_level_counter["order"].update([taxonomy[3]]) taxonomy_level_counter["class"].update([taxonomy[2]]) taxonomy_level_counter["phylum"].update([taxonomy[1]]) summary_counter.update(["Assigned Taxonomy"]) + if not all(s=='' for s in toks[4:12]): + summary_counter.update(["Assigned Both"]) + if not all(s=='' for s in toks[4:12]): + summary_counter.update(["Assigned Function"]) + return dict( taxonomy_level_counter=taxonomy_level_counter, summary_counter=summary_counter ) - def compile_summary_df(classification_tables, tax_levels=["phylum", "class", "order"]): """ Reads in multiple sample alignments from diamond in a given directory and merges them into - a single pandas.DataFrame. It returns a pandas dataframe for each of th + a single pandas.DataFrame. It returns a pandas dataframe for each of phylum that is ready to plug into the processing function. 
Also returns total counts which is necessary to calculate the percentage of total that is being represented """ dfs = {} classifications_per_sample = {} + cols = df.filter(regex="[A-Z]+").columns.values.tolist() + samples = df.filter(regex="[0-9]+").columns.values.tolist() + for sample in samples: + df[cols] + for classification_table in classification_tables: sample = get_sample(classification_table, "_classifications.txt") parsed_taxonomy = parse_classifications_for_taxonomy(classification_table) @@ -311,14 +353,14 @@ def main( clean_logs = glob(clean_logs) unique_logs = glob(unique_logs) merge_logs = glob(merge_logs) - summary_tables = glob(summary_tables) + summary_table = summary_tables r1_quality_files = glob(r1_quality_files) - classifications_per_sample = compile_summary_df(summary_tables) - value_cols = get_sample_name(summary_tables, "_classifications.txt") - fig = build_taxonomy_plot(taxonomy_table, value_cols) + #classifications_per_sample = compile_summary_df(summary_tables) + #value_cols = get_sample_name(summary_tables, "_classifications.txt") + fig = build_taxonomy_plot(taxonomy_table) plots = offline.plot(fig, **PLOTLY_PARAMS) html_tbl = parse_log_files( - merge_logs, unique_logs, clean_logs, classifications_per_sample + merge_logs, unique_logs, clean_logs, summary_table ) quality_plot = build_quality_plot(r1_quality_files) conda_env = get_conda_env_str(conda_env) @@ -333,7 +375,7 @@ def main( PerSeq_ - Per sequence functional and taxonomic assignments ============================================================= -.. _PerSeq: https://github.com/pnnl +.. _PerSeq: https://github.com/PNNL-CompBio/perseq .. contents:: From c85d26a3c0300264bc0ce9944ed4f096b4b17e8c Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 27 Nov 2018 16:24:09 -0800 Subject: [PATCH 05/14] fixes evalue arg type --- scripts/summarize_classifications.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/summarize_classifications.py b/scripts/summarize_classifications.py index eb0f89e..46baf37 100755 --- a/scripts/summarize_classifications.py +++ b/scripts/summarize_classifications.py @@ -192,7 +192,7 @@ def main(output, table, tax_level, min_evalue, min_score, min_len,group_on): ) parser.add_argument( "--min-len", - type=int, + type=float, default=0.001, help="minimum kaiju alignment length to retain", ) From 61af7ee141d3511ea6799693dbf5ef1f38ad9dd8 Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 27 Nov 2018 17:33:44 -0800 Subject: [PATCH 06/14] adds rule to remove empty annotations --- Snakefile | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/Snakefile b/Snakefile index a6a6f00..697ac5f 100644 --- a/Snakefile +++ b/Snakefile @@ -621,9 +621,22 @@ rule combine_sample_output: """ +rule remove_empty_annotations: + input: + annotations = "gene_catalog/annotations.txt" + output: + cleaned = "gene_catalog/annotations_cleaned.txt" + run: + with open(input.annotations) as file,open(output.cleaned,"w") as output: + next(file) + for line in file: + toks = line.strip("\r\n").split("\t") + if not all(s=='' for s in toks[4:12]): + print(line, end="\n", file=output) + rule build_hmms_table: input: - "gene_catalog/annotations.txt" + "gene_catalog/annotations_cleaned.txt" output: "summaries/hmms_summary/{hmm}_summary.txt" params: @@ -645,7 +658,7 @@ rule build_hmms_table: rule build_tax_table: input: - "gene_catalog/annotations.txt" + "gene_catalog/annotations_cleaned.txt" output: "summaries/taxonomy/{tax_classification}.txt" params: @@ -667,7 +680,7 @@ 
rule build_tax_table: rule build_hmm_and_tax_table: input: - "gene_catalog/annotations.txt" + "gene_catalog/annotations_cleaned.txt" output: "summaries/combined/{hmm}_{tax_classification}.txt" params: @@ -749,7 +762,7 @@ rule zip_attachments: rule build_report: input: - annotations = "gene_catalog/annotations.txt", + annotations = "gene_catalog/annotations_cleaned.txt", ee_stats = expand("logs/{sample}_{idx}_eestats.txt", sample=config["samples"].keys(), idx=["R1", "R2"]), clean_length_logs = expand("logs/{sample}_03_clean_readlengths.txt", sample=config["samples"].keys()), unique_length_logs = expand("logs/{sample}_02_unique_readlengths.txt", sample=config["samples"].keys()), From 3c296831faa229a083af0487a31a67d17305af06 Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Mon, 3 Dec 2018 15:42:29 -0800 Subject: [PATCH 07/14] fixed annotations parsing --- Snakefile | 27 ++++++++------- scripts/build_report.py | 50 +++++++++++++++++----------- scripts/summarize_classifications.py | 31 +++++++++-------- 3 files changed, 63 insertions(+), 45 deletions(-) diff --git a/Snakefile b/Snakefile index 697ac5f..e2835ca 100644 --- a/Snakefile +++ b/Snakefile @@ -136,12 +136,12 @@ def get_summaries(): hmm = ["TIGRFAMs","HAMAP","dbCAN"] taxonomy = ["phylum", "class", "order"] # code to generate the possible files - file_paths = expand("summaries/combined/{hmm}_{taxonomy}.txt", - hmm=hmm, taxonomy=taxonomy) + file_paths = expand("summaries/combined/{hmm}_{tax_classification}.txt", + hmm=hmm, tax_classification=taxonomy) file_paths.extend(expand("summaries/hmms_summary/{hmm}_summary.txt", hmm=hmm)) - file_paths.extend(expand("summaries/taxonomy/{taxonomy}.txt", - taxonomy=taxonomy)) + file_paths.extend(expand("summaries/taxonomy/{tax_classification}.txt", + tax_classification=taxonomy)) return file_paths @@ -627,12 +627,13 @@ rule remove_empty_annotations: output: cleaned = "gene_catalog/annotations_cleaned.txt" run: - with open(input.annotations) as file,open(output.cleaned,"w") as output: - next(file) + with open(input.annotations) as file, open(output.cleaned,"w") as output: + header = next(file) + print(header.strip("\n"), file=output) for line in file: toks = line.strip("\r\n").split("\t") if not all(s=='' for s in toks[4:12]): - print(line, end="\n", file=output) + print(line.strip("\n"), file=output) rule build_hmms_table: input: @@ -672,7 +673,7 @@ rule build_tax_table: shell: """ python scripts/summarize_classifications.py \ - --group-on kaiju_classification --min-evalue {params.min_evalue} \ + --group-on kaiju_taxonomy --min-evalue {params.min_evalue} \ --min-score {params.min_score} --min-len {params.min_len} \ --tax-level {wildcards.tax_classification} {output} {input} """ @@ -692,7 +693,7 @@ rule build_hmm_and_tax_table: shell: """ python scripts/summarize_classifications.py \ - --group-on {wildcards.hmm} kaiju_classification \ + --group-on {wildcards.hmm} kaiju_taxonomy \ --tax-level {wildcards.tax_classification} --min-evalue {params.min_evalue} \ --min-score {params.min_score} --min-len {params.min_len} {output} {input} """ @@ -752,10 +753,12 @@ rule zip_attachments: krona_tax = "krona_plots/tax.krona.html", krona_ec = "krona_plots/ec.krona.html" output: - temp("perseq_downloads.zip") + temp("perseq_downloads.tar.gz") + conda: + CONDAENV shell: """ - zip {output} {input.function} {input.taxonomy} {input.combined} \ + tar -czf {output} {input.function} {input.taxonomy} {input.combined} \ {input.krona_tax} {input.krona_ec} """ @@ -769,7 +772,7 @@ rule build_report: clean_logs = 
expand("logs/{sample}_decontamination.log", sample=config["samples"].keys()), merge_logs = expand("logs/{sample}_merge_sequences.log", sample=config["samples"].keys()), taxonomy = "summaries/taxonomy/order.txt", - zipped_files = "perseq_downloads.zip" + zipped_files = "perseq_downloads.tar.gz" output: "summary.html" conda: diff --git a/scripts/build_report.py b/scripts/build_report.py index e63bb73..496073d 100644 --- a/scripts/build_report.py +++ b/scripts/build_report.py @@ -1,4 +1,4 @@ - import argparse +import argparse import csv import os from collections import Counter, defaultdict @@ -53,7 +53,7 @@ def build_taxonomy_plot(txt, height=900): hierarchy = ["phylum", "class", "order"] df[hierarchy] = df[hierarchy].fillna("NA") df[samples] = df[samples].fillna(0) - df = df[hierarchy + value_cols] + df = df[hierarchy + samples] dfs = relatively.get_dfs_across_hierarchy( df, hierarchy, samples, reorder="shannon", dependent="; " ) @@ -122,19 +122,22 @@ def parse_classifications_for_taxonomy(path): # skip the header next(fh) for i, line in enumerate(fh, start=1): - toks = line.strip("\r\n").split("\t") - if toks[2]: - taxonomy_counter.update([toks[7]]) - taxonomy = [j.strip() for j in toks[7].split(";")] - taxonomy_level_counter["order"].update([taxonomy[3]]) - taxonomy_level_counter["class"].update([taxonomy[2]]) - taxonomy_level_counter["phylum"].update([taxonomy[1]]) - summary_counter.update(["Assigned Taxonomy"]) - if not all(s=='' for s in toks[4:12]): - summary_counter.update(["Assigned Both"]) + toks = line.strip("\n").split("\t") + try: + if toks[2]: + taxonomy_counter.update([toks[2]]) + taxonomy = [j.strip() for j in toks[2].split(";")] + taxonomy_level_counter["order"].update([taxonomy[3]]) + taxonomy_level_counter["class"].update([taxonomy[2]]) + taxonomy_level_counter["phylum"].update([taxonomy[1]]) + summary_counter.update(["Assigned Taxonomy"]) + if not all(s=='' for s in toks[4:12]): + summary_counter.update(["Assigned Both"]) + except: + print(len(line)) + continue if not all(s=='' for s in toks[4:12]): summary_counter.update(["Assigned Function"]) - return dict( taxonomy_level_counter=taxonomy_level_counter, summary_counter=summary_counter @@ -215,6 +218,7 @@ def parse_log_files(merge_logs, unique_logs, clean_logs, classifications_per_sam "Unique", "Clean", ] + samp_df = pd.DataFrame.from_dict(classifications_per_sample) count_table = defaultdict(list) # initial read count, join count, join rate, insert size from merge step for merge_log in merge_logs: @@ -238,9 +242,12 @@ def parse_log_files(merge_logs, unique_logs, clean_logs, classifications_per_sam count_table[sample].append(int(line.strip("\r\n").split("\t")[-1])) log_df = pd.DataFrame.from_dict(count_table, orient="index") + print(log_df.head(10)) + print(log_df.shape) + print(samp_df.shape) log_df.columns = header # print("classifications", classifications_per_sample.head()) - log_df = log_df.merge(classifications_per_sample, left_index=True, right_index=True) + log_df = log_df.merge(samp_df, left_index=True, right_index=True) # print(log_df.head()) log_df.reset_index(inplace=True) header.insert(0, "Sample") @@ -347,20 +354,25 @@ def main( r1_quality_files, html, conda_env, - function_table, + taxonomy_table, zipped_file ): clean_logs = glob(clean_logs) unique_logs = glob(unique_logs) merge_logs = glob(merge_logs) summary_table = summary_tables + # parse classifcations for summary + assigned_dict = parse_classifications_for_taxonomy(summary_table)["summary_counter"] r1_quality_files = glob(r1_quality_files) - 
#classifications_per_sample = compile_summary_df(summary_tables) + #classifications_per_sample = compile_summary_df(summary_table) #value_cols = get_sample_name(summary_tables, "_classifications.txt") fig = build_taxonomy_plot(taxonomy_table) plots = offline.plot(fig, **PLOTLY_PARAMS) + # html_tbl = parse_log_files( + # merge_logs, unique_logs, clean_logs, classifications_per_sample + # ) html_tbl = parse_log_files( - merge_logs, unique_logs, clean_logs, summary_table + merge_logs, unique_logs, clean_logs, assigned_dict ) quality_plot = build_quality_plot(r1_quality_files) conda_env = get_conda_env_str(conda_env) @@ -572,7 +584,7 @@ def main( p.add_argument("--r1-quality-files") p.add_argument("--html") p.add_argument("conda_env") - p.add_argument("function_table") + p.add_argument("taxonomy_table") p.add_argument("zipped_file") # p.add_argument("taxonomy_table") # p.add_argument("taxonomy_function_table") @@ -587,7 +599,7 @@ def main( args.r1_quality_files, args.html, args.conda_env, - args.function_table, + args.taxonomy_table, args.zipped_file, # args.taxonomy_table, # args.taxonomy_function_table, diff --git a/scripts/summarize_classifications.py b/scripts/summarize_classifications.py index 46baf37..4721468 100755 --- a/scripts/summarize_classifications.py +++ b/scripts/summarize_classifications.py @@ -57,25 +57,28 @@ # logging.debug(f"Final table length: {len(df)}") # return df -def single_output(df,tax_level,samples,min_evlaue,min_score,min_len,group_on,output): +def single_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,output): split_idx = TAX_LEVELS[tax_level] # group_one=["tigrfams","hamap","dbcan","kaiju"] - # for item in group_one: - hmm_cols = df.filter(like=group_on[0]).columns.values.tolist() - #TODO might not need - test_df = df[hmm_cols+samples].copy() - if "kaiju" in item: + # for item in group_on + #test_df.head() + if "kaiju_taxonomy" in group_on: + hmm_cols = df.filter(like="kaiju").columns.values.tolist() + test_df = df[hmm_cols+samples].copy() test_df = test_df[(df["kaiju_length"] > min_len)] - test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) + test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(str(x).split("; ", split_idx)[0:split_idx])) test_df = test_df.drop(columns=["kaiju_length"]).groupby(["kaiju_taxonomy"]).sum(axis=1).reset_index() test_df = test_df.rename( columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} ) else: + hmm_cols = df.filter(like=group_on[0].lower()).columns.values.tolist() + #TODO might not need + test_df = df[hmm_cols+samples].copy() metric_cols = test_df.filter(items=[group_on[0]+"_score",group_on[0]+"_evalue"]).columns.values.tolist() test_df = test_df[(test_df[group_on[0].lower()+"_score"] > min_score) & (test_df[group_on[0].lower()+"_evalue"] < min_evalue)] test_df = test_df.drop(columns=metric_cols).groupby(hmm_cols[0:3]).sum(axis=1).reset_index().head() - test_df.to_csv(item+output, sep="\t", index=False) + test_df.to_csv(output, sep="\t", index=False) def combined_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,output): @@ -86,8 +89,8 @@ def combined_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,o hmm_cols = df.filter(like=group_on[0].lower()).columns.values.tolist() #TODO might not need test_df = df[kaiju_cols+hmm_cols+samples].copy() - test_df = test_df[(df["kaiju_length"] > min_len) & (test_df[group_on[0]+"_score"] > min_score) & (test_df[group_on[0]+"_evalue"] < min_evalue)] - 
test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) + test_df = test_df[(df["kaiju_length"] > min_len) & (test_df[group_on[0].lower()+"_score"] > min_score) & (test_df[group_on[0].lower()+"_evalue"] < min_evalue)] + test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(str(x).split("; ", split_idx)[0:split_idx])) metric_cols = test_df.filter(items=[group_on[0].lower()+"_score",group_on[0].lower()+"_evalue"]).columns.values.tolist() metric_cols.append("kaiju_length") test_df = test_df.drop(columns=metric_cols).groupby(["kaiju_taxonomy"]+hmm_cols[0:3]).sum(axis=1).reset_index() @@ -98,7 +101,7 @@ def combined_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,o def main(output, table, tax_level, min_evalue, min_score, min_len,group_on): - df = pd.read_table(table) + df = pd.read_table(table,low_memory=False) # remove the rows w/ no information df = df.dropna( subset=[ @@ -110,9 +113,9 @@ def main(output, table, tax_level, min_evalue, min_score, min_len,group_on): thresh=1) samples = df.filter(regex="[0-9]+").columns.values.tolist() if len(group_on) > 1: - combined_output(df,tax_level,min_evalue, min_score, min_len, group_on,output) + combined_output(df,tax_level,samples,min_evalue, min_score, min_len, group_on,output) else: - single_output(df,tax_level,samples,min_evlaue,min_score,min_len,group_on,output) + single_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,output) # split_idx = TAX_LEVELS[tax_level] # group_one=["tigrfams","hamap","dbcan","kaiju"] # for item in group_one: @@ -180,7 +183,7 @@ def main(output, table, tax_level, min_evalue, min_score, min_len,group_on): ) parser.add_argument( "--min-evalue", - type=int, + type=float, default=0.001, help="lowest evalue to retain per sequence per sample", ) From 848e36889b50ba5ef690fc013f424a56fb3501b3 Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 4 Dec 2018 10:53:13 -0800 Subject: [PATCH 08/14] fixed cleaning annotaitons --- Snakefile | 2 +- scripts/build_report.py | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/Snakefile b/Snakefile index e2835ca..ff8a699 100644 --- a/Snakefile +++ b/Snakefile @@ -632,7 +632,7 @@ rule remove_empty_annotations: print(header.strip("\n"), file=output) for line in file: toks = line.strip("\r\n").split("\t") - if not all(s=='' for s in toks[4:12]): + if not all(s=='' for s in toks[2:12]): print(line.strip("\n"), file=output) rule build_hmms_table: diff --git a/scripts/build_report.py b/scripts/build_report.py index 496073d..18fb71f 100644 --- a/scripts/build_report.py +++ b/scripts/build_report.py @@ -142,6 +142,39 @@ def parse_classifications_for_taxonomy(path): taxonomy_level_counter=taxonomy_level_counter, summary_counter=summary_counter ) +def parse_files_for_annotation(path): + """ + parses the annotations to retain which samples were assigned function vs + taxonomy vs both + """ + summary_counter={} + with open(path) as fh: + # skip the header + header = next(fh) + header = header.strip("\n").split("\t") + for sample in header[18:]: + summary_counter[sample]=Counter() + for line in fh: + toks = line.strip("\n").split("\t") + try: + if toks[2]: + for i,sample in enumerate(header[18:]): + if toks[18+i] != 0: + summary_counter[sample].update(["Assigned Taxonomy"]) + summary_counter.update() + if not all(s=='' for s in toks[3:12]): + for i,sample in enumerate(header[18:]): + if toks[18+i] != 0: + 
summary_counter[sample].update(["Assigned Both"]) + #summary_counter.update(["Assigned Both"]) + except: + print(len(line)) + continue + if not all(s=='' for s in toks[4:12]): + for i,sample in enumerate(header[18:]): + if toks[18+i] != 0: + summary_counter[sample].update(["Assigned Function"]) + return summary_counter def compile_summary_df(classification_tables, tax_levels=["phylum", "class", "order"]): """ @@ -218,7 +251,7 @@ def parse_log_files(merge_logs, unique_logs, clean_logs, classifications_per_sam "Unique", "Clean", ] - samp_df = pd.DataFrame.from_dict(classifications_per_sample) + samp_df = pd.DataFrame.from_dict(classifications_per_sample,orient = "index") count_table = defaultdict(list) # initial read count, join count, join rate, insert size from merge step for merge_log in merge_logs: @@ -362,7 +395,7 @@ def main( merge_logs = glob(merge_logs) summary_table = summary_tables # parse classifcations for summary - assigned_dict = parse_classifications_for_taxonomy(summary_table)["summary_counter"] + assigned_dict = parse_files_for_annotation(summary_table) r1_quality_files = glob(r1_quality_files) #classifications_per_sample = compile_summary_df(summary_table) #value_cols = get_sample_name(summary_tables, "_classifications.txt") From 55284a6186367561909e039e8f3e9ca9860f94ac Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 4 Dec 2018 13:50:10 -0800 Subject: [PATCH 09/14] fixed summary table metrics --- scripts/build_report.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/build_report.py b/scripts/build_report.py index 18fb71f..daadd40 100644 --- a/scripts/build_report.py +++ b/scripts/build_report.py @@ -159,12 +159,12 @@ def parse_files_for_annotation(path): try: if toks[2]: for i,sample in enumerate(header[18:]): - if toks[18+i] != 0: + if str(toks[18+i]) != 0: summary_counter[sample].update(["Assigned Taxonomy"]) summary_counter.update() if not all(s=='' for s in toks[3:12]): for i,sample in enumerate(header[18:]): - if toks[18+i] != 0: + if str(toks[18+i]) != 0: summary_counter[sample].update(["Assigned Both"]) #summary_counter.update(["Assigned Both"]) except: @@ -172,7 +172,7 @@ def parse_files_for_annotation(path): continue if not all(s=='' for s in toks[4:12]): for i,sample in enumerate(header[18:]): - if toks[18+i] != 0: + if str(toks[18+i]) != 0: summary_counter[sample].update(["Assigned Function"]) return summary_counter From f2e164003486b7d0aa4307b998f14681aafaadc4 Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 4 Dec 2018 15:41:25 -0800 Subject: [PATCH 10/14] updates reltaively version --- envs/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/envs/environment.yml b/envs/environment.yml index c8977a0..1ea37c7 100644 --- a/envs/environment.yml +++ b/envs/environment.yml @@ -19,5 +19,5 @@ dependencies: - seqtk=1.3 - pip - pip: - - relatively + - relatively>=1.0.5 - plotly>=3.0 From 83c64f8905a94bd72d98dd2fe440e2a0327f5e7d Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Wed, 5 Dec 2018 09:38:12 -0800 Subject: [PATCH 11/14] remove temp on zipped file --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index ff8a699..4fe7acc 100644 --- a/Snakefile +++ b/Snakefile @@ -753,7 +753,7 @@ rule zip_attachments: krona_tax = "krona_plots/tax.krona.html", krona_ec = "krona_plots/ec.krona.html" output: - temp("perseq_downloads.tar.gz") + "perseq_downloads.tar.gz" conda: CONDAENV shell: From ff90f7546f657683e8d94aef1a9113e99308eab9 Mon Sep 17 00:00:00 2001 From: 
nzavoshy Date: Wed, 5 Dec 2018 09:38:39 -0800 Subject: [PATCH 12/14] fixes column names --- scripts/build_report.py | 191 ++++++++++++++++++++++++---------------- 1 file changed, 114 insertions(+), 77 deletions(-) diff --git a/scripts/build_report.py b/scripts/build_report.py index daadd40..f5d4fb2 100644 --- a/scripts/build_report.py +++ b/scripts/build_report.py @@ -47,7 +47,7 @@ def get_sample_order(lst): def build_taxonomy_plot(txt, height=900): df = pd.read_table(txt) - samples = df.filter(regex="[0-9]+").columns.values.tolist() + samples = df.columns.values.tolist()[1:] levels = ["kingdom", "phylum", "class", "order"] df[levels] = df["taxonomy_order"].str.split(";", expand=True) hierarchy = ["phylum", "class", "order"] @@ -160,12 +160,12 @@ def parse_files_for_annotation(path): if toks[2]: for i,sample in enumerate(header[18:]): if str(toks[18+i]) != 0: - summary_counter[sample].update(["Assigned Taxonomy"]) - summary_counter.update() - if not all(s=='' for s in toks[3:12]): - for i,sample in enumerate(header[18:]): - if str(toks[18+i]) != 0: - summary_counter[sample].update(["Assigned Both"]) + summary_counter[sample].update(["Kaiju+NR"]) + #summary_counter.update() + # if not all(s=='' for s in toks[3:12]): + # for i,sample in enumerate(header[18:]): + # if str(toks[18+i]) != 0: + # summary_counter[sample].update(["Assigned Both"]) #summary_counter.update(["Assigned Both"]) except: print(len(line)) @@ -173,43 +173,43 @@ def parse_files_for_annotation(path): if not all(s=='' for s in toks[4:12]): for i,sample in enumerate(header[18:]): if str(toks[18+i]) != 0: - summary_counter[sample].update(["Assigned Function"]) + summary_counter[sample].update(["TIGRFAMs,HAMAP,dbCAN\n hits per sample"]) return summary_counter -def compile_summary_df(classification_tables, tax_levels=["phylum", "class", "order"]): - """ - Reads in multiple sample alignments from diamond in a given directory and merges them into - a single pandas.DataFrame. It returns a pandas dataframe for each of - phylum that is ready to plug into the processing function. Also returns total counts - which is necessary to calculate the percentage of total that is being represented - """ - dfs = {} - classifications_per_sample = {} - cols = df.filter(regex="[A-Z]+").columns.values.tolist() - samples = df.filter(regex="[0-9]+").columns.values.tolist() - for sample in samples: - df[cols] - - for classification_table in classification_tables: - sample = get_sample(classification_table, "_classifications.txt") - parsed_taxonomy = parse_classifications_for_taxonomy(classification_table) - # assigned #'s in summary table - classifications_per_sample[sample] = parsed_taxonomy["summary_counter"] - if len(dfs) == 0: - for tax_level in tax_levels: - dfs[tax_level] = get_df_at_tax_level( - parsed_taxonomy["taxonomy_level_counter"][tax_level], - sample, - tax_level, - ) - continue - - for tax_level in tax_levels: - df = get_df_at_tax_level( - parsed_taxonomy["taxonomy_level_counter"][tax_level], sample, tax_level - ) - dfs[tax_level] = dfs[tax_level].merge(df, on=tax_level, how="outer") - return pd.DataFrame.from_dict(classifications_per_sample, orient="index") +# def compile_summary_df(classification_tables, tax_levels=["phylum", "class", "order"]): +# """ +# Reads in multiple sample alignments from diamond in a given directory and merges them into +# a single pandas.DataFrame. It returns a pandas dataframe for each of +# phylum that is ready to plug into the processing function. 
Also returns total counts
+# which is necessary to calculate the percentage of total that is being represented
+#     """
+#     dfs = {}
+#     classifications_per_sample = {}
+#     cols = df.filter(regex="[A-Z]+").columns.values.tolist()
+#     samples = df.filter(regex="[0-9]+").columns.values.tolist()
+#     for sample in samples:
+#         df[cols]
+#
+#     for classification_table in classification_tables:
+#         sample = get_sample(classification_table, "_classifications.txt")
+#         parsed_taxonomy = parse_classifications_for_taxonomy(classification_table)
+#         # assigned #'s in summary table
+#         classifications_per_sample[sample] = parsed_taxonomy["summary_counter"]
+#         if len(dfs) == 0:
+#             for tax_level in tax_levels:
+#                 dfs[tax_level] = get_df_at_tax_level(
+#                     parsed_taxonomy["taxonomy_level_counter"][tax_level],
+#                     sample,
+#                     tax_level,
+#                 )
+#             continue
+#
+#         for tax_level in tax_levels:
+#             df = get_df_at_tax_level(
+#                 parsed_taxonomy["taxonomy_level_counter"][tax_level], sample, tax_level
+#             )
+#             dfs[tax_level] = dfs[tax_level].merge(df, on=tax_level, how="outer")
+#     return pd.DataFrame.from_dict(classifications_per_sample, orient="index")


 def get_df_at_tax_level(count_obj, sample, tax_level):
@@ -472,33 +472,54 @@ def main(
 specified during the decontamination process. Functional annotation and
 taxonomic classification were performed following the decontamination step.
 
+Translation and Clustering
+**************************
+
+Raw nucleotide sequences are translated into protein sequences using Prodigal[4]
+in anonymous mode. These sequences are then clustered at 90% sequence identity
+using MMseqs2[5] to decrease the computational resources required to perform
+annotation. These representative sequences are then annotated for functional
+and taxonomic information.
+
+
 Functional Annotation
 *********************
 
-The blastx algorithm of DIAMOND [4] was used to align nucleotide sequences to
-the KEGG protein reference database [5] consisting of non-redundant, family
-level fungal eukaryotes and genus level prokaryotes
-(``--strand=both --evalue 0.00001``). The highest scoring alignment per
-sequence was used for functional annotation.
+Three distinct HMM libraries are used to infer functional potential:
+HAMAP[6], dbCAN[7], and TIGRFAMs[8]. Each sequence was aligned to each database
+using hmmsearch from HMMER[9]. TIGRFAMs and HAMAP both provide an Enzyme
+Commission (EC) number, gene, and product, while dbCAN provides enzyme class,
+enzyme subclass, and EC.
 
 Taxonomic Annotation
 ********************
 
 Kmer-based taxonomic classification was performed on the merged reads using
-Kaiju [6] in greedy mode (``-a greedy -E 0.05``). NCBI's nr database [7]
+Kaiju [10] in greedy mode (``-a greedy -E 0.05``). NCBI's nr database [11]
 containing reference sequences for archaea, bacteria, viruses, fungi, and
 microbial eukaryotes was used as the reference index for Kaiju.
 
+Aggregation
+***********
+
+Once functional and taxonomic annotation has completed, all of the sequences per
+sample are aligned to the now annotated representative sequences using DIAMOND[12].
+
 
 References
 **********
 
 1. Rognes T, Flouri T, Nichols B, Quince C, Mahé F. VSEARCH: a versatile open source tool for metagenomics. PeerJ. PeerJ Inc; 2016;4:e2584.
 2. Bushnell B. BBTools [Internet]. Available from: https://sourceforge.net/projects/bbmap/
 3. Li H. Seqtk [Internet]. Available from: https://github.com/lh3/seqtk
-4. Buchfink B, Xie C, Huson DH. Fast and sensitive protein alignment using DIAMOND. Nat. Methods. Nature Publishing Group; 2015;12:59–60.
-5. Kanehisa M, Sato Y, Kawashima M, Furumichi M, Tanabe M. KEGG as a reference resource for gene and protein annotation. Nucleic Acids Res. 2016;44:D457–62.
-6. Menzel P, Ng KL, Krogh A. Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nat Commun. Nature Publishing Group; 2016;7:11257.
-7. NCBI Resource Coordinators. Database resources of the National Center for Biotechnology Information. Nucleic Acids Res. 2018;46:D8–D13.
+4. Hyatt D, Chen G-L, LoCascio PF, Land ML, Larimer FW, Hauser LJ. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics. 2010;11:119.
+5. Steinegger M, Söding J. MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nat Biotechnol. 2017;35:1026–8.
+6. Pedruzzi I, Rivoire C, Auchincloss AH, et al. HAMAP in 2015: updates to the protein family classification and annotation system. Nucleic Acids Res. 2015;43:D1064–70.
+7. Yin Y, Mao X, Yang J, Chen X, Mao F, Xu Y. dbCAN: a web resource for automated carbohydrate-active enzyme annotation. Nucleic Acids Res. 2012;40:W445–51.
+8. Haft DH, Selengut JD, White O. The TIGRFAMs database of protein families. Nucleic Acids Res. 2003;31:371–3.
+9. Eddy SR. Accelerated profile HMM searches. PLoS Comput Biol. 2011;7:e1002195.
+10. Menzel P, Ng KL, Krogh A. Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nat Commun. Nature Publishing Group; 2016;7:11257.
+11. NCBI Resource Coordinators. Database resources of the National Center for Biotechnology Information. Nucleic Acids Res. 2018;46:D8–D13.
+12. Buchfink B, Xie C, Huson DH. Fast and sensitive protein alignment using DIAMOND. Nat. Methods. Nature Publishing Group; 2015;12:59–60.
 
 
 Execution Environment
@@ -511,10 +532,10 @@ def main(
 Output
 ------
 
-Classification Tables
+Annotations Table
 *********************
 
-Per sample classifications in tables/ contain:
+Table containing annotations across all samples:
 
 .. table::
     :name: classificationtable
 
     ========================= ==========================================================================================================================================
     Header ID                 Definition
     ========================= ==========================================================================================================================================
+    seq                       identifier of the representative sequence in the gene catalog
+    kaiju_length              length of the Kaiju match; filtered with ``--min-len``
+    kaiju_taxonomy            semicolon delimited taxonomic lineage assigned by Kaiju
+    tigrfams_ec               Enzyme Commission number of the best TIGRFAMs hit; comma delimited where multiple
+    tigrfams_gene             gene symbol of the best TIGRFAMs hit
+    tigrfams_product          product description of the best TIGRFAMs hit
+    tigrfams_score            bit score of the best TIGRFAMs hit
+    tigrfams_evalue           e-value of the best TIGRFAMs hit
+    hamap_ec                  Enzyme Commission number of the best HAMAP hit
+    hamap_gene                gene symbol of the best HAMAP hit
+    hamap_product             product description of the best HAMAP hit
+    hamap_score               bit score of the best HAMAP hit
+    hamap_evalue              e-value of the best HAMAP hit
+
+    <HMM>_ec                  Enzyme Commission number from the given HMM library
+    <HMM>_gene                gene symbol from the given HMM library hit
     ec Enzyme Commission number from KEGG; semicolon delimited where multiple
     ko KEGG entry ID
     product KEGG gene ID KEGG product
@@ -555,41 +590,43 @@ def main(
 
 Function
 ````````
 
-Per function assignments in tables named **summaries/function/.txt**
+Per function assignments in tables named **summaries/hmms_summary/<HMM>_summary.txt**
 contain:
 
 .. table::
     :name: functiondeftable
 
-    ==================== ===================================================================
-    Header ID            Definition
-    ==================== ===================================================================
-                         either KO, EC, or product into which counts have been summed
-    samples names        non-normalized, per sample sum for this particular functional group
-    level_1              KEGG hierarchy [level 1] if KO defined in first column
-    level_2              KEGG hierarchy [level 2] if KO defined in first column
-    level_3              KEGG hierarchy [level 3] if KO defined in first column
-    ==================== ===================================================================
+    ============================ ====================================================================
+    Header ID                    Definition
+    ============================ ====================================================================
+    <HMM>_ec                     either HAMAP, dbCAN, or TIGRFAMs EC
+    <HMM>_gene                   either HAMAP, dbCAN, or TIGRFAMs gene
+    <HMM>_product                either HAMAP, dbCAN, or TIGRFAMs product
+    <HMM>_enzyme_class           dbCAN only enzyme class descriptor
+    <HMM>_enzyme_class_subfamily dbCAN only enzyme class subfamily descriptor
+    samples names                non-normalized, per sample sum for this particular functional group
+    ============================ ====================================================================
 
 Combined
 ````````
 
 Per taxonomy+function assignments in tables named
-**summaries/combined/_.txt** contain:
+**summaries/combined/<HMM>_<tax_classification>.txt** contain:
 
 .. table::
     :name: combineddeftable
 
-    ==================== ====================================================================
-    Header ID            Definition
-    ==================== ====================================================================
-                         either KO, EC, or product; counts are summed using  +
-    taxonomy_            taxonomic level; counts are summed using  +
-    sample names         non-normalized, per sample sum for this particular functional group
-    level_1              KEGG hierarchy [level 1] if KO defined in first column
-    level_2              KEGG hierarchy [level 2] if KO defined in first column
-    level_3              KEGG hierarchy [level 3] if KO defined in first column
-    ==================== ====================================================================
+    ============================ ====================================================================
+    Header ID                    Definition
+    ============================ ====================================================================
+    <HMM>_ec                     either HAMAP, dbCAN, or TIGRFAMs EC
+    <HMM>_gene                   either HAMAP, dbCAN, or TIGRFAMs gene
+    <HMM>_product                either HAMAP, dbCAN, or TIGRFAMs product
+    <HMM>_enzyme_class           dbCAN only enzyme class descriptor
+    <HMM>_enzyme_class_subfamily dbCAN only enzyme class subfamily descriptor
+    taxonomy_<tax_level>         taxonomic level; counts are summed using <HMM> + <tax_level>
+    sample names                 non-normalized, per sample sum for this particular functional group
+    ============================ ====================================================================
 
 Downloads
 ---------

From c82b642537be26404144ccabfa35e52fb39ce89c Mon Sep 17 00:00:00 2001
From: nzavoshy
Date: Wed, 5 Dec 2018 09:39:24 -0800
Subject: [PATCH 13/14] fixes combined function

---
 scripts/summarize_classifications.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/scripts/summarize_classifications.py b/scripts/summarize_classifications.py
index 4721468..8853d67 100755
--- a/scripts/summarize_classifications.py
+++ b/scripts/summarize_classifications.py
@@ -77,7 +77,7 @@ def single_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,out
         test_df = df[hmm_cols+samples].copy()
         metric_cols =
test_df.filter(items=[group_on[0]+"_score",group_on[0]+"_evalue"]).columns.values.tolist() test_df = test_df[(test_df[group_on[0].lower()+"_score"] > min_score) & (test_df[group_on[0].lower()+"_evalue"] < min_evalue)] - test_df = test_df.drop(columns=metric_cols).groupby(hmm_cols[0:3]).sum(axis=1).reset_index().head() + test_df = test_df.drop(columns=metric_cols).groupby(hmm_cols[0:3]).sum(axis=1).reset_index() test_df.to_csv(output, sep="\t", index=False) @@ -87,11 +87,14 @@ def combined_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,o kaiju_cols = df.filter(like="kaiju").columns.values.tolist() # for item in group_one: hmm_cols = df.filter(like=group_on[0].lower()).columns.values.tolist() + print(samples) + print(group_on[0]) #TODO might not need test_df = df[kaiju_cols+hmm_cols+samples].copy() test_df = test_df[(df["kaiju_length"] > min_len) & (test_df[group_on[0].lower()+"_score"] > min_score) & (test_df[group_on[0].lower()+"_evalue"] < min_evalue)] test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(str(x).split("; ", split_idx)[0:split_idx])) metric_cols = test_df.filter(items=[group_on[0].lower()+"_score",group_on[0].lower()+"_evalue"]).columns.values.tolist() + print(metric_cols) metric_cols.append("kaiju_length") test_df = test_df.drop(columns=metric_cols).groupby(["kaiju_taxonomy"]+hmm_cols[0:3]).sum(axis=1).reset_index() test_df = test_df.rename( @@ -102,16 +105,17 @@ def combined_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,o def main(output, table, tax_level, min_evalue, min_score, min_len,group_on): df = pd.read_table(table,low_memory=False) - # remove the rows w/ no information - df = df.dropna( - subset=[ - "kaiju_taxonomy", "tigrfams_ec", "tigrfams_gene", "tigrfams_product", + col_names = [ + "seq","kaiju_length","kaiju_taxonomy", "tigrfams_ec", "tigrfams_gene", "tigrfams_product", "tigrfams_score", "tigrfams_evalue", "hamap_ec", "hamap_gene", "hamap_product", "hamap_score", "hamap_evalue", "dbcan_ec", - "dbcan_enzyme_class", "dbcan_enzyme_class_subfamily" - ], - thresh=1) - samples = df.filter(regex="[0-9]+").columns.values.tolist() + "dbcan_enzyme_class", "dbcan_enzyme_class_subfamily","dbcan_score", "dbcan_evalue" + ] + # remove the rows w/ no information + # df = df.dropna( + # subset= col_names, + # thresh=2) + samples = [c for c in df.columns.values.tolist() if c not in col_names] if len(group_on) > 1: combined_output(df,tax_level,samples,min_evalue, min_score, min_len, group_on,output) else: From 17ff8e755926d3aa1b33422b4b736ac791e57e84 Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Thu, 6 Dec 2018 14:07:48 -0800 Subject: [PATCH 14/14] fixes assigned columns --- scripts/build_report.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/build_report.py b/scripts/build_report.py index f5d4fb2..bb45b17 100644 --- a/scripts/build_report.py +++ b/scripts/build_report.py @@ -159,7 +159,7 @@ def parse_files_for_annotation(path): try: if toks[2]: for i,sample in enumerate(header[18:]): - if str(toks[18+i]) != 0: + if int(toks[18+i]) != 0: summary_counter[sample].update(["Kaiju+NR"]) #summary_counter.update() # if not all(s=='' for s in toks[3:12]): @@ -172,7 +172,7 @@ def parse_files_for_annotation(path): continue if not all(s=='' for s in toks[4:12]): for i,sample in enumerate(header[18:]): - if str(toks[18+i]) != 0: + if int(toks[18+i]) != 0: summary_counter[sample].update(["TIGRFAMs,HAMAP,dbCAN\n hits per sample"]) return summary_counter @@ -279,12 +279,14 @@ def 
parse_log_files(merge_logs, unique_logs, clean_logs, classifications_per_sam
     print(log_df.head(10))
     print(log_df.shape)
     print(samp_df.shape)
     log_df.columns = header
+    print(log_df.head())
     # print("classifications", classifications_per_sample.head())
     log_df = log_df.merge(samp_df, left_index=True, right_index=True)
+    print(log_df.head())
     # print(log_df.head())
     log_df.reset_index(inplace=True)
     header.insert(0, "Sample")
-    header.extend(["Assigned\nFunction", "Assigned\nTaxonomy", "Assigned\nBoth"])
+    header.extend(["TIGRFAMs,HAMAP,dbCAN\n hits per sample", "Kaiju+NR"])
     log_df.columns = header
     sample_summary_table = log_df[header].to_html(
         index=False,
@@ -533,7 +535,7 @@ def main(
 ------
 
 Annotations Table
-*********************
+*****************
 
 Table containing annotations across all samples:
 
 .. table::
     :name: classificationtable
 
     ========================= ==========================================================================================================================================
     Header ID                 Definition
     ========================= ==========================================================================================================================================
     seq                       identifier of the representative sequence in the gene catalog
     kaiju_length              length of the Kaiju match; filtered with ``--min-len``
     kaiju_taxonomy            semicolon delimited taxonomic lineage assigned by Kaiju
     tigrfams_ec               Enzyme Commission number of the best TIGRFAMs hit; comma delimited where multiple
     tigrfams_gene             gene symbol of the best TIGRFAMs hit
     tigrfams_product          product description of the best TIGRFAMs hit
     tigrfams_score            bit score of the best TIGRFAMs hit
     tigrfams_evalue           e-value of the best TIGRFAMs hit
     hamap_ec                  Enzyme Commission number of the best HAMAP hit
     hamap_gene                gene symbol of the best HAMAP hit
     hamap_product             product description of the best HAMAP hit
     hamap_score               bit score of the best HAMAP hit
     hamap_evalue              e-value of the best HAMAP hit
-
     <HMM>_ec                  Enzyme Commission number from the given HMM library
     <HMM>_gene                gene symbol from the given HMM library hit
     ec Enzyme Commission number from KEGG; semicolon delimited where multiple
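
A note on the Krona inputs built in patches 01 and 07: the per-sample files
written by build_ec_output in scripts/build_krona.py are header-less,
tab-separated files of the form count<TAB>level_1<TAB>...<TAB>level_4. The
sketch below reproduces that merge-and-split step on a toy table; the EC
numbers, hierarchy labels, and sample names are hypothetical, and in the real
pipeline the hierarchy comes from build_ec_dict.

import pandas as pd

# Hypothetical merged counts: one row per EC number, one count column per
# sample (the shape returned by parse_ec_file in scripts/build_krona.py).
ec_df = pd.DataFrame(
    {
        "tigrfams_ec": ["1.1.1.1", "2.6.1.16", "3.4.21.92"],
        "sample_01": [10, 4, 0],
        "sample_02": [2, 7, 5],
    }
)
# Hypothetical EC hierarchy lookup: ec -> [level_1 .. level_4].
hierarchy = {
    "1.1.1.1": ["Oxidoreductases", "Acting on CH-OH", "With NAD(+)", "alcohol dehydrogenase"],
    "2.6.1.16": ["Transferases", "Nitrogenous groups", "Transaminases", "GlmS"],
    "3.4.21.92": ["Hydrolases", "Peptide bonds", "Serine endopeptidases", "ClpP"],
}
hier_df = (
    pd.DataFrame.from_dict(hierarchy, orient="index")
    .reset_index()
    .rename(columns={"index": "tigrfams_ec", 0: "level_1", 1: "level_2", 2: "level_3", 3: "level_4"})
)
# Attach the hierarchy, then write one Krona text file per sample:
# count<TAB>level_1<TAB>level_2<TAB>level_3<TAB>level_4, no header.
merged = ec_df.merge(hier_df, on="tigrfams_ec", how="inner")
for sample in ["sample_01", "sample_02"]:
    merged[[sample, "level_1", "level_2", "level_3", "level_4"]].to_csv(
        f"{sample}_ec.txt", sep="\t", index=False, header=False
    )

Each such file can then be rendered with KronaTools, presumably via
ktImportText sample_01_ec.txt,sample_01 ... -o ec.krona.html, which is the
standard file,name pair syntax that ktImportText accepts.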
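
The summary rules touched in patches 02, 03, and 07 all rely on the same
pandas pattern: filter the annotation table on the per-library score and
e-value cutoffs, then group on the annotation columns and sum the per-sample
counts. A minimal, self-contained sketch of that pattern, using hypothetical
TIGRFAMs rows and sample names:

import pandas as pd

# Hypothetical slice of gene_catalog/annotations_cleaned.txt.
df = pd.DataFrame(
    {
        "tigrfams_ec": ["6.3.5.3", "6.3.5.3", "2.7.7.6"],
        "tigrfams_gene": ["purL", "purL", "rpoB"],
        "tigrfams_product": ["FGAM synthase", "FGAM synthase", "RNA polymerase beta subunit"],
        "tigrfams_score": [310.2, 55.0, 880.1],
        "tigrfams_evalue": [1e-90, 0.5, 1e-200],
        "sample_01": [3, 1, 8],
        "sample_02": [0, 2, 5],
    }
)
samples = ["sample_01", "sample_02"]
min_score, min_evalue = 60, 0.001

# Keep confident hits only; the weak purL row (score 55, e-value 0.5) drops out.
kept = df[(df["tigrfams_score"] > min_score) & (df["tigrfams_evalue"] < min_evalue)]
# Collapse duplicate annotations into per-sample sums.
summary = (
    kept.groupby(["tigrfams_ec", "tigrfams_gene", "tigrfams_product"])[samples]
    .sum()
    .reset_index()
)
summary.to_csv("TIGRFAMs_summary.txt", sep="\t", index=False)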
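
Finally, the taxonomy summaries cut each Kaiju lineage down to a fixed rank
before grouping, via str.split("; ", n) with a split limit, so that lineages
deeper than the requested rank collapse into the same key. A small example of
that one-liner, with made-up lineages:

import pandas as pd

TAX_LEVELS = {"phylum": 2, "class": 3, "order": 4}

lineages = pd.Series(
    [
        "Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; Enterobacteriaceae",
        "Archaea; Thaumarchaeota; Nitrososphaeria; Nitrososphaerales; Nitrososphaeraceae",
    ]
)
split_idx = TAX_LEVELS["order"]
# Split at most split_idx times and rejoin the leading ranks, so every
# lineage is truncated to kingdom..order regardless of its original depth.
truncated = lineages.apply(lambda x: "; ".join(str(x).split("; ", split_idx)[0:split_idx]))
print(truncated.tolist())
# ['Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales',
#  'Archaea; Thaumarchaeota; Nitrososphaeria; Nitrososphaerales']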