diff --git a/Snakefile b/Snakefile index 463699e..4fe7acc 100644 --- a/Snakefile +++ b/Snakefile @@ -136,12 +136,12 @@ def get_summaries(): hmm = ["TIGRFAMs","HAMAP","dbCAN"] taxonomy = ["phylum", "class", "order"] # code to generate the possible files - file_paths = expand("summaries/combined/{hmm}_{taxonomy}.txt", - hmm=hmm, taxonomy=taxonomy) + file_paths = expand("summaries/combined/{hmm}_{tax_classification}.txt", + hmm=hmm, tax_classification=taxonomy) file_paths.extend(expand("summaries/hmms_summary/{hmm}_summary.txt", hmm=hmm)) - file_paths.extend(expand("summaries/taxonomy/{taxonomy}.txt", - taxonomy=taxonomy)) + file_paths.extend(expand("summaries/taxonomy/{tax_classification}.txt", + tax_classification=taxonomy)) return file_paths @@ -394,7 +394,7 @@ rule aggregate_all_genes: with open(f) as fh: for name, seq, _ in readfx(fh): # per sequence, choose first only - if name.endswith("_2"): + if not name.endswith("_1"): continue print(">%d" % name_index, seq.replace("*", ""), sep="\n", file=out_faa) name_index += 1 @@ -481,7 +481,7 @@ rule split_fasta: input: fasta = "gene_catalog/clustered_genes.faa" output: - temp((dynamic("gene_catalog/tmp/clustered_genes_{chunk}.faa"))) + dynamic("gene_catalog/tmp/clustered_genes_{chunk}.faa") params: # consider a smaller chunk size chunk_size = config.get("chunk_size", 1000000) @@ -621,9 +621,23 @@ rule combine_sample_output: """ +rule remove_empty_annotations: + input: + annotations = "gene_catalog/annotations.txt" + output: + cleaned = "gene_catalog/annotations_cleaned.txt" + run: + with open(input.annotations) as file, open(output.cleaned,"w") as output: + header = next(file) + print(header.strip("\n"), file=output) + for line in file: + toks = line.strip("\r\n").split("\t") + if not all(s=='' for s in toks[2:12]): + print(line.strip("\n"), file=output) + rule build_hmms_table: input: - "gene_catalog/annotations.txt" + "gene_catalog/annotations_cleaned.txt" output: "summaries/hmms_summary/{hmm}_summary.txt" params: @@ -645,7 +659,7 @@ rule build_hmms_table: rule build_tax_table: input: - "gene_catalog/annotations.txt" + "gene_catalog/annotations_cleaned.txt" output: "summaries/taxonomy/{tax_classification}.txt" params: @@ -659,7 +673,7 @@ rule build_tax_table: shell: """ python scripts/summarize_classifications.py \ - --group-on kaiju_classification --min-evalue {params.min_evalue} \ + --group-on kaiju_taxonomy --min-evalue {params.min_evalue} \ --min-score {params.min_score} --min-len {params.min_len} \ --tax-level {wildcards.tax_classification} {output} {input} """ @@ -667,7 +681,7 @@ rule build_tax_table: rule build_hmm_and_tax_table: input: - "gene_catalog/annotations.txt" + "gene_catalog/annotations_cleaned.txt" output: "summaries/combined/{hmm}_{tax_classification}.txt" params: @@ -679,12 +693,12 @@ rule build_hmm_and_tax_table: shell: """ python scripts/summarize_classifications.py \ - --group-on {wildcards.hmm} kaiju_classification \ + --group-on {wildcards.hmm} kaiju_taxonomy \ --tax-level {wildcards.tax_classification} --min-evalue {params.min_evalue} \ - --min-score {params.min_score}--min-len {params.min_len} {output} {input} + --min-score {params.min_score} --min-len {params.min_len} {output} {input} """ -## TODO fix this rule + rule build_krona_ec_input: input: ec_file = "summaries/hmms_summary/TIGRFAMs_summary.txt", @@ -731,30 +745,34 @@ rule build_krona_plots: """ -# rule zip_attachments: -# input: -# function = "summaries/function/ec.txt", -# taxonomy = "summaries/taxonomy/order.txt", -# krona_tax = 
"krona_plots/tax.krona.html", -# krona_ec = "krona_plots/ec.krona.html" -# output: -# temp("perseq_downloads.zip") -# shell: -# """ -# zip {output} {input.function} {input.taxonomy} {input.combined} -# """ +rule zip_attachments: + input: + function = "summaries/hmms_summary/TIGRFAMs_summary.txt", + taxonomy = "summaries/taxonomy/order.txt", + combined = "summaries/combined/TIGRFAMs_order.txt", + krona_tax = "krona_plots/tax.krona.html", + krona_ec = "krona_plots/ec.krona.html" + output: + "perseq_downloads.tar.gz" + conda: + CONDAENV + shell: + """ + tar -czf {output} {input.function} {input.taxonomy} {input.combined} \ + {input.krona_tax} {input.krona_ec} + """ rule build_report: input: - annotations = "gene_catalog/annotations.txt", + annotations = "gene_catalog/annotations_cleaned.txt", ee_stats = expand("logs/{sample}_{idx}_eestats.txt", sample=config["samples"].keys(), idx=["R1", "R2"]), clean_length_logs = expand("logs/{sample}_03_clean_readlengths.txt", sample=config["samples"].keys()), unique_length_logs = expand("logs/{sample}_02_unique_readlengths.txt", sample=config["samples"].keys()), clean_logs = expand("logs/{sample}_decontamination.log", sample=config["samples"].keys()), merge_logs = expand("logs/{sample}_merge_sequences.log", sample=config["samples"].keys()), taxonomy = "summaries/taxonomy/order.txt", - zipped_files = "perseq_downloads.zip" + zipped_files = "perseq_downloads.tar.gz" output: "summary.html" conda: diff --git a/envs/environment.yml b/envs/environment.yml index c8977a0..1ea37c7 100644 --- a/envs/environment.yml +++ b/envs/environment.yml @@ -19,5 +19,5 @@ dependencies: - seqtk=1.3 - pip - pip: - - relatively + - relatively>=1.0.5 - plotly>=3.0 diff --git a/scripts/build_krona.py b/scripts/build_krona.py index 6222eed..0fc5642 100755 --- a/scripts/build_krona.py +++ b/scripts/build_krona.py @@ -17,11 +17,10 @@ def build_tax(tax_file, output): # read in tax file to order level from build_tax_table rule test_tax = pd.read_table(tax_file) col_names = list(test_tax)[1:] - taxonomies = ["Kingdom", "Phylum", "Class", "Order"] # split taxonomy column into hierarchy test_tax[ ["Kingdom", "Phylum", "Class", "Order"] - ] = test_tax.taxonomy_order.str.split("; ", expand=True) + ] = test_tax.taxonomy_order.str.split("; ",n=4, expand=True) # generate individual files per sample for item in col_names: df_name = f"test_tax_{item}" @@ -85,62 +84,96 @@ def build_ec_dict(ec_filename, dat_filename): return d +# def parse_ec_file(ec_file_from_summaries): +# """ +# Split sequences that aligned to multiple ec numbers into last common ec number +# shared amongst matches +# """ +# with gzopen(ec_file_from_summaries) as ec_file_sum: +# +# next(ec_file_sum) +# new_ec_dict = {"ec": []} +# # split ec numbers that mapped to multiple +# for item in ec_file_sum: +# toks = item.partition("\t")[0] +# ec = toks.split(";") +# # print(ec) +# first_item = ec[0][:5] +# if len(ec) == 1: +# new_ec_dict["ec"].append(ec[0]) +# else: +# n = 0 +# for item in ec[1:]: +# if item[:5] == first_item: +# new_ec_dict["ec"].append(item[:5] + ".-") +# # print('same',item,first_item) +# elif item[:3] == first_item[:3]: +# new_ec_dict["ec"].append(item[:3] + ".-.-") +# # print('same to high level',item,first_item) +# elif item[:1] == first_item[:1]: +# new_ec_dict["ec"].append(item[:1] + ".-.-.-") +# return new_ec_dict + def parse_ec_file(ec_file_from_summaries): """ - Split sequences that aligned to multiple ec numbers into last common ec number - shared amongst matches + For ec numbers that aligned to multiple ecs 
in TIGRFAMs, the first entry is
+    the only one that will be retained for Krona plots.
     """
-    with gzopen(ec_file_from_summaries) as ec_file_sum:
-
-        next(ec_file_sum)
-        new_ec_dict = {"ec": []}
-        # split ec numbers that mapped to multiple
-        for item in ec_file_sum:
-            toks = item.partition("\t")[0]
-            ec = toks.split(";")
-            # print(ec)
-            first_item = ec[0][:5]
-            if len(ec) == 1:
-                new_ec_dict["ec"].append(ec[0])
-            else:
-                n = 0
-                for item in ec[1:]:
-                    if item[:5] == first_item:
-                        new_ec_dict["ec"].append(item[:5] + ".-")
-                        # print('same',item,first_item)
-                    elif item[:3] == first_item[:3]:
-                        new_ec_dict["ec"].append(item[:3] + ".-.-")
-                        # print('same to high level',item,first_item)
-                    elif item[:1] == first_item[:1]:
-                        new_ec_dict["ec"].append(item[:1] + ".-.-.-")
-    return new_ec_dict
-
-
-def build_ec_output(new_ec_dict, ec_file_from_summaries, parsed_dict, output):
+    ec_df = pd.read_table(ec_file_from_summaries)
+    samples = ec_df.filter(regex="[0-9]+").columns.values.tolist()
+    ec_df = ec_df[["tigrfams_ec"] + samples]
+    ec_df = ec_df.groupby(["tigrfams_ec"]).sum(axis=1).reset_index()
+    ec_df[["tigrfams_ec"]] = ec_df.tigrfams_ec.str.split(",", n=1).str[0]
+    return ec_df
+
+
+# def build_ec_output(new_ec_dict, ec_file_from_summaries, parsed_dict, output):
+#     """
+#     Splits the ec number counts and hierarchy by sample
+#     """
+#     #ec_replace = pd.DataFrame.from_dict(new_ec_dict)
+#     ec_table = pd.read_table(ec_file_from_summaries)
+#     col_names = list(ec_table)[1:]
+#     #ec_table["ec"] = ec_replace["ec"]
+#     ec_hier_dict = pd.DataFrame.from_dict(parsed_dict, orient="index").reset_index()
+#     ec_hier_dict = ec_hier_dict.rename(columns={"index": "ec"})
+#     # print(ec_hier_dict.head(5))
+#     grouped_ec_tbl = ec_table.merge(ec_hier_dict, on="ec", how="inner")
+#     # print(grouped_ec_tbl.head(5))
+#     grouped_ec_tbl = grouped_ec_tbl.rename(
+#         columns={0: "level_1", 1: "level_2", 2: "level_3", 3: "level_4"}
+#     )
+#     # split the samples into separate files
+#     for item in col_names:
+#         df_name = f"test_tax_{item}"
+#         df_name = grouped_ec_tbl[
+#             [f"{item}", "level_1", "level_2", "level_3", "level_4"]
+#         ]
+#
+#         df_name.to_csv(
+#             output + "/" + f"{item}" + "_ec.txt", sep="\t", index=False, header=False
+#         )
+def build_ec_output(ec_df, parsed_dict, output):
     """
     Splits the ec number counts and hierarchy by sample
     """
-    ec_replace = pd.DataFrame.from_dict(new_ec_dict)
-    ec_table = pd.read_table(ec_file_from_summaries)
-    col_names = list(ec_table)[1:]
-    ec_table["ec"] = ec_replace["ec"]
+    ec_table = ec_df
+    samples = list(ec_table)[1:]
     ec_hier_dict = pd.DataFrame.from_dict(parsed_dict, orient="index").reset_index()
-    ec_hier_dict = ec_hier_dict.rename(columns={"index": "ec"})
-    # print(ec_hier_dict.head(5))
-    grouped_ec_tbl = ec_table.merge(ec_hier_dict, on="ec", how="inner")
-    # print(grouped_ec_tbl.head(5))
+    ec_hier_dict = ec_hier_dict.rename(columns={"index": "tigrfams_ec"})
+    grouped_ec_tbl = ec_table.merge(ec_hier_dict, on="tigrfams_ec", how="inner")
     grouped_ec_tbl = grouped_ec_tbl.rename(
         columns={0: "level_1", 1: "level_2", 2: "level_3", 3: "level_4"}
     )
     # split the samples into separate files
-    for item in col_names:
-        df_name = f"test_tax_{item}"
+    for sample in samples:
+        #df_name = f"test_tax_{sample}"
         df_name = grouped_ec_tbl[
-            [f"{item}", "level_1", "level_2", "level_3", "level_4"]
+            [f"{sample}", "level_1", "level_2", "level_3", "level_4"]
         ]
-
+        print(df_name.head())
         df_name.to_csv(
-            output + "/" + f"{item}" + "_ec.txt", sep="\t", index=False, header=False
+            output + "/" + f"{sample}" + "_ec.txt", sep="\t",
index=False, header=False ) @@ -148,9 +181,9 @@ def main(tax_file, output, ec_file, ec_file_from_summaries, dat_file): if tax_file is not None: build_tax(tax_file, output) else: - new_ec_dict = parse_ec_file(ec_file_from_summaries) + ec_df = parse_ec_file(ec_file_from_summaries) parsed_dict = build_ec_dict(ec_file, dat_file) - build_ec_output(new_ec_dict, ec_file_from_summaries, parsed_dict, output) + build_ec_output(ec_df, parsed_dict, output) if __name__ == "__main__": @@ -167,7 +200,7 @@ def main(tax_file, output, ec_file, ec_file_from_summaries, dat_file): p.add_argument( "--ec-file-from-summaries", type=str, - help="file that contains the ec numbers from the diamond alignment", + help="file that contains the ec numbers from TIGRFAMs summary", ) p.add_argument( "--dat-file", diff --git a/scripts/build_report.py b/scripts/build_report.py index ace0eaa..bb45b17 100644 --- a/scripts/build_report.py +++ b/scripts/build_report.py @@ -45,16 +45,17 @@ def get_sample_order(lst): return [i[0] for i in sorted(lst, key=lambda x: x[1], reverse=True)] -def build_taxonomy_plot(txt, value_cols, height=900): +def build_taxonomy_plot(txt, height=900): df = pd.read_table(txt) + samples = df.columns.values.tolist()[1:] levels = ["kingdom", "phylum", "class", "order"] df[levels] = df["taxonomy_order"].str.split(";", expand=True) hierarchy = ["phylum", "class", "order"] df[hierarchy] = df[hierarchy].fillna("NA") - df[value_cols] = df[value_cols].fillna(0) - df = df[hierarchy + value_cols] + df[samples] = df[samples].fillna(0) + df = df[hierarchy + samples] dfs = relatively.get_dfs_across_hierarchy( - df, hierarchy, value_cols, reorder="shannon", dependent="; " + df, hierarchy, samples, reorder="shannon", dependent="; " ) fig = relatively.get_abundance_figure_from_dfs( dfs, hierarchy, "Assigned Taxonomy Per Sample", height=height @@ -66,11 +67,47 @@ def get_sample_name(path, key): return [os.path.basename(item).partition(key)[0] for item in path] +# def parse_classifications_for_taxonomy(path): +# """ +# SN1035:381:h3cv7bcx2:2:2202:3165:73750 -1 -1 +# SN1035:381:h3cv7bcx2:1:1105:7280:88523 67.5 40 111 Archaea; Thaumarchaeota; Nitrososphaerales; Nitrososphaeria; Nitrososphaeraceae; Candidatus Nitrosocosmicus; Candidatus Nitrocosmicus oleophilus; +# SN1035:381:h3cv7bcx2:1:1204:20140:83036 78.4 37 ko:K00820 glmS, GFPT; glucosamine---fructose-6-phosphate aminotransferase (isomerizing) 2.6.1.16 229 Bacteria; Actinobacteria; NA; NA; NA; NA; Actinobacteria bacterium 13_1_40CM_66_12; +# """ +# logger.info("Parsing {}".format(path)) +# # hardcoded tax levels :( +# taxonomy_level_counter = { +# "order": Counter(), +# "class": Counter(), +# "phylum": Counter(), +# } +# taxonomy_counter = Counter() +# summary_counter = Counter() +# with open(path) as fh: +# # skip the header +# next(fh) +# for i, line in enumerate(fh, start=1): +# toks = line.strip("\r\n").split("\t") +# if toks[3]: +# summary_counter.update(["Assigned Function"]) +# if toks[7]: +# summary_counter.update(["Assigned Both"]) +# if toks[7]: +# taxonomy_counter.update([toks[7]]) +# taxonomy = [j.strip() for j in toks[7].split(";")] +# taxonomy_level_counter["order"].update([taxonomy[3]]) +# taxonomy_level_counter["class"].update([taxonomy[2]]) +# taxonomy_level_counter["phylum"].update([taxonomy[1]]) +# summary_counter.update(["Assigned Taxonomy"]) +# return dict( +# taxonomy_level_counter=taxonomy_level_counter, +# summary_counter=summary_counter +# ) + def parse_classifications_for_taxonomy(path): """ - SN1035:381:h3cv7bcx2:2:2202:3165:73750 -1 -1 - 
SN1035:381:h3cv7bcx2:1:1105:7280:88523 67.5 40 111 Archaea; Thaumarchaeota; Nitrososphaerales; Nitrososphaeria; Nitrososphaeraceae; Candidatus Nitrosocosmicus; Candidatus Nitrocosmicus oleophilus; - SN1035:381:h3cv7bcx2:1:1204:20140:83036 78.4 37 ko:K00820 glmS, GFPT; glucosamine---fructose-6-phosphate aminotransferase (isomerizing) 2.6.1.16 229 Bacteria; Actinobacteria; NA; NA; NA; NA; Actinobacteria bacterium 13_1_40CM_66_12; + 37056 274 Bacteria; Bacteroidetes; Sphingobacteriia; Sphingobacteriales; NA; NA; Sphingobacteriales bacterium UTBCD1; 3.4.21.92 clpP ATP-dependent Clp endopeptidase proteolytic subun + 75331 0 0 0 0 0 0 0 0 0 + """ logger.info("Parsing {}".format(path)) # hardcoded tax levels :( @@ -85,53 +122,94 @@ def parse_classifications_for_taxonomy(path): # skip the header next(fh) for i, line in enumerate(fh, start=1): - toks = line.strip("\r\n").split("\t") - if toks[3]: + toks = line.strip("\n").split("\t") + try: + if toks[2]: + taxonomy_counter.update([toks[2]]) + taxonomy = [j.strip() for j in toks[2].split(";")] + taxonomy_level_counter["order"].update([taxonomy[3]]) + taxonomy_level_counter["class"].update([taxonomy[2]]) + taxonomy_level_counter["phylum"].update([taxonomy[1]]) + summary_counter.update(["Assigned Taxonomy"]) + if not all(s=='' for s in toks[4:12]): + summary_counter.update(["Assigned Both"]) + except: + print(len(line)) + continue + if not all(s=='' for s in toks[4:12]): summary_counter.update(["Assigned Function"]) - if toks[7]: - summary_counter.update(["Assigned Both"]) - if toks[7]: - taxonomy_counter.update([toks[7]]) - taxonomy = [j.strip() for j in toks[7].split(";")] - taxonomy_level_counter["order"].update([taxonomy[3]]) - taxonomy_level_counter["class"].update([taxonomy[2]]) - taxonomy_level_counter["phylum"].update([taxonomy[1]]) - summary_counter.update(["Assigned Taxonomy"]) return dict( taxonomy_level_counter=taxonomy_level_counter, summary_counter=summary_counter ) - - -def compile_summary_df(classification_tables, tax_levels=["phylum", "class", "order"]): +def parse_files_for_annotation(path): """ - Reads in multiple sample alignments from diamond in a given directory and merges them into - a single pandas.DataFrame. It returns a pandas dataframe for each of th - phylum that is ready to plug into the processing function. 
Also returns total counts - which is necessary to calculate the percentage of total that is being represented + parses the annotations to retain which samples were assigned function vs + taxonomy vs both """ - dfs = {} - classifications_per_sample = {} - for classification_table in classification_tables: - sample = get_sample(classification_table, "_classifications.txt") - parsed_taxonomy = parse_classifications_for_taxonomy(classification_table) - # assigned #'s in summary table - classifications_per_sample[sample] = parsed_taxonomy["summary_counter"] - if len(dfs) == 0: - for tax_level in tax_levels: - dfs[tax_level] = get_df_at_tax_level( - parsed_taxonomy["taxonomy_level_counter"][tax_level], - sample, - tax_level, - ) - continue - - for tax_level in tax_levels: - df = get_df_at_tax_level( - parsed_taxonomy["taxonomy_level_counter"][tax_level], sample, tax_level - ) - dfs[tax_level] = dfs[tax_level].merge(df, on=tax_level, how="outer") - return pd.DataFrame.from_dict(classifications_per_sample, orient="index") + summary_counter={} + with open(path) as fh: + # skip the header + header = next(fh) + header = header.strip("\n").split("\t") + for sample in header[18:]: + summary_counter[sample]=Counter() + for line in fh: + toks = line.strip("\n").split("\t") + try: + if toks[2]: + for i,sample in enumerate(header[18:]): + if int(toks[18+i]) != 0: + summary_counter[sample].update(["Kaiju+NR"]) + #summary_counter.update() + # if not all(s=='' for s in toks[3:12]): + # for i,sample in enumerate(header[18:]): + # if str(toks[18+i]) != 0: + # summary_counter[sample].update(["Assigned Both"]) + #summary_counter.update(["Assigned Both"]) + except: + print(len(line)) + continue + if not all(s=='' for s in toks[4:12]): + for i,sample in enumerate(header[18:]): + if int(toks[18+i]) != 0: + summary_counter[sample].update(["TIGRFAMs,HAMAP,dbCAN\n hits per sample"]) + return summary_counter + +# def compile_summary_df(classification_tables, tax_levels=["phylum", "class", "order"]): +# """ +# Reads in multiple sample alignments from diamond in a given directory and merges them into +# a single pandas.DataFrame. It returns a pandas dataframe for each of +# phylum that is ready to plug into the processing function. 
Also returns total counts +# which is necessary to calculate the percentage of total that is being represented +# """ +# dfs = {} +# classifications_per_sample = {} +# cols = df.filter(regex="[A-Z]+").columns.values.tolist() +# samples = df.filter(regex="[0-9]+").columns.values.tolist() +# for sample in samples: +# df[cols] +# +# for classification_table in classification_tables: +# sample = get_sample(classification_table, "_classifications.txt") +# parsed_taxonomy = parse_classifications_for_taxonomy(classification_table) +# # assigned #'s in summary table +# classifications_per_sample[sample] = parsed_taxonomy["summary_counter"] +# if len(dfs) == 0: +# for tax_level in tax_levels: +# dfs[tax_level] = get_df_at_tax_level( +# parsed_taxonomy["taxonomy_level_counter"][tax_level], +# sample, +# tax_level, +# ) +# continue +# +# for tax_level in tax_levels: +# df = get_df_at_tax_level( +# parsed_taxonomy["taxonomy_level_counter"][tax_level], sample, tax_level +# ) +# dfs[tax_level] = dfs[tax_level].merge(df, on=tax_level, how="outer") +# return pd.DataFrame.from_dict(classifications_per_sample, orient="index") def get_df_at_tax_level(count_obj, sample, tax_level): @@ -173,6 +251,7 @@ def parse_log_files(merge_logs, unique_logs, clean_logs, classifications_per_sam "Unique", "Clean", ] + samp_df = pd.DataFrame.from_dict(classifications_per_sample,orient = "index") count_table = defaultdict(list) # initial read count, join count, join rate, insert size from merge step for merge_log in merge_logs: @@ -196,13 +275,18 @@ def parse_log_files(merge_logs, unique_logs, clean_logs, classifications_per_sam count_table[sample].append(int(line.strip("\r\n").split("\t")[-1])) log_df = pd.DataFrame.from_dict(count_table, orient="index") + print(log_df.head(10)) + print(log_df.shape) + print(samp_df.shape) log_df.columns = header + print(log_df.head()) # print("classifications", classifications_per_sample.head()) - log_df = log_df.merge(classifications_per_sample, left_index=True, right_index=True) + log_df = log_df.merge(samp_df, left_index=True, right_index=True) + print(log_df.head()) # print(log_df.head()) log_df.reset_index(inplace=True) header.insert(0, "Sample") - header.extend(["Assigned\nFunction", "Assigned\nTaxonomy", "Assigned\nBoth"]) + header.extend(["TIGRFAMs,HAMAP,dbCAN\n hits per sample", "Kaiju+NR"]) log_df.columns = header sample_summary_table = log_df[header].to_html( index=False, @@ -305,20 +389,25 @@ def main( r1_quality_files, html, conda_env, - function_table, + taxonomy_table, zipped_file ): clean_logs = glob(clean_logs) unique_logs = glob(unique_logs) merge_logs = glob(merge_logs) - summary_tables = glob(summary_tables) + summary_table = summary_tables + # parse classifcations for summary + assigned_dict = parse_files_for_annotation(summary_table) r1_quality_files = glob(r1_quality_files) - classifications_per_sample = compile_summary_df(summary_tables) - value_cols = get_sample_name(summary_tables, "_classifications.txt") - fig = build_taxonomy_plot(taxonomy_table, value_cols) + #classifications_per_sample = compile_summary_df(summary_table) + #value_cols = get_sample_name(summary_tables, "_classifications.txt") + fig = build_taxonomy_plot(taxonomy_table) plots = offline.plot(fig, **PLOTLY_PARAMS) + # html_tbl = parse_log_files( + # merge_logs, unique_logs, clean_logs, classifications_per_sample + # ) html_tbl = parse_log_files( - merge_logs, unique_logs, clean_logs, classifications_per_sample + merge_logs, unique_logs, clean_logs, assigned_dict ) quality_plot = 
build_quality_plot(r1_quality_files)
     conda_env = get_conda_env_str(conda_env)
@@ -333,7 +422,7 @@ def main(
 PerSeq_ - Per sequence functional and taxonomic assignments
 =============================================================
 
-.. _PerSeq: https://github.com/pnnl
+.. _PerSeq: https://github.com/PNNL-CompBio/perseq
 
 .. contents::
 
@@ -385,33 +474,54 @@ def main(
 specified during the decontamination process. Functional annotation and
 taxonomic classification were performed following the decontamination step.
 
+Translation and Clustering
+**************************
+
+Raw nucleotide sequences are translated into protein sequences using Prodigal[4]
+in anonymous mode. These sequences are then clustered at 90% sequence identity
+using mmseqs[5] to decrease the computational resources required to perform
+annotation. The resulting representative sequences are then annotated for
+functional and taxonomic information.
+
+
 Functional Annotation
 *********************
 
-The blastx algorithm of DIAMOND [4] was used to align nucleotide sequences to
-the KEGG protein reference database [5] consisting of non-redundant, family
-level fungal eukaryotes and genus level prokaryotes
-(``--strand=both --evalue 0.00001``). The highest scoring alignment per
-sequence was used for functional annotation.
+Three distinct HMM libraries were used to infer functional potential:
+HAMAP[6], dbCAN[7], and TIGRFAMs[8]. Each sequence was aligned to each database
+using hmmsearch from HMMER[9]. TIGRFAMs and HAMAP both provide an enzyme
+commission (EC) number, gene, and product, while dbCAN provides enzyme class,
+enzyme subclass, and EC.
 
 Taxonomic Annotation
 ********************
 
 Kmer-based taxonomic classification was performed on the merged reads using
-Kaiju [6] in greedy mode (``-a greedy -E 0.05``). NCBI's nr database [7]
+Kaiju [10] in greedy mode (``-a greedy -E 0.05``). NCBI's nr database [11]
 containing reference sequences for archaea, bacteria, viruses, fungi, and
 microbial eukaryotes was used as the reference index for Kaiju.
 
+Aggregation
+***********
+
+Once functional and taxonomic annotation has completed, the sequences from each
+sample are aligned to the annotated representative sequences using DIAMOND[12].
+
 
 References
 **********
 
 1. Rognes T, Flouri T, Nichols B, Quince C, Mahé F. VSEARCH: a versatile open source tool for metagenomics. PeerJ. PeerJ Inc; 2016;4:e2584.
 2. Bushnell B. BBTools [Internet]. Available from: https://sourceforge.net/projects/bbmap/
 3. Li H. Seqtk [Internet]. Available from: https://github.com/lh3/seqtk
-4. Buchfink B, Xie C, Huson DH. Fast and sensitive protein alignment using DIAMOND. Nat. Methods. Nature Publishing Group; 2015;12:59–60.
-5. Kanehisa M, Sato Y, Kawashima M, Furumichi M, Tanabe M. KEGG as a reference resource for gene and protein annotation. Nucleic Acids Res. 2016;44:D457–62.
-6. Menzel P, Ng KL, Krogh A. Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nat Commun. Nature Publishing Group; 2016;7:11257.
-7. NCBI Resource Coordinators. Database resources of the National Center for Biotechnology Information. Nucleic Acids Res. 2018;46:D8–D13.
+4. Hyatt D, Chen G-L, LoCascio PF, Land ML, Larimer FW, Hauser LJ. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics. 2010;11:119.
+5. Steinegger M, Söding J. MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nat Biotechnol. 2017;35:1026–8.
+6. Pedruzzi I, Rivoire C, Auchincloss AH, et al. HAMAP in 2015: updates to the protein family classification and annotation system. Nucleic Acids Res. 2015;43:D1064–70.
+7. Yin Y, Mao X, Yang J, Chen X, Mao F, Xu Y. dbCAN: a web resource for automated carbohydrate-active enzyme annotation. Nucleic Acids Res. 2012;40:W445–51.
+8. Haft DH, Selengut JD, White O. The TIGRFAMs database of protein families. Nucleic Acids Res. 2003;31:371–3.
+9. Eddy SR. Accelerated profile HMM searches. PLoS Comput Biol. 2011;7:e1002195.
+10. Menzel P, Ng KL, Krogh A. Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nat Commun. Nature Publishing Group; 2016;7:11257.
+11. NCBI Resource Coordinators. Database resources of the National Center for Biotechnology Information. Nucleic Acids Res. 2018;46:D8–D13.
+12. Buchfink B, Xie C, Huson DH.
Fast and sensitive protein alignment using DIAMOND. Nat. Methods. Nature Publishing Group; 2015;12:59–60. Execution Environment @@ -424,10 +534,10 @@ def main( Output ------ -Classification Tables -********************* +Annotations Table +***************** -Per sample classifications in tables/ contain: +Table containing annotations across all samples: .. table:: :name: classificationtable @@ -435,8 +545,21 @@ def main( ========================= ========================================================================================================================================== Header ID Definition ========================= ========================================================================================================================================== - aa_alignment_length The length of the DIAMOND blastx hit - aa_percent_id The percent ID of the DIAMOND blastx hit; could be used to increase post-processing stringency + seq + kaiju_length + kaiju_taxonomy + tigrfams_ec + tigrfams_gene + tigrfams_product + tigrfams_score + tigrfams_evalue + hamap_ec + hamap_gene + hamap_product + hamap_score + hamap_evalue + _ec The length of the DIAMOND blastx hit + _gene The percent ID of the DIAMOND blastx hit; could be used to increase post-processing stringency ec Enzyme Commission number from KEGG; semicolon delimited where multiple ko KEGG entry ID product KEGG gene ID KEGG product @@ -468,41 +591,43 @@ def main( Function ```````` -Per function assignments in tables named **summaries/function/.txt** +Per function assignments in tables named **summaries/hmms_summary/.txt** contain: .. table:: :name: functiondeftable - ==================== =================================================================== - Header ID Definition - ==================== =================================================================== - either KO, EC, or product into which counts have been summed - samples names non-normalized, per sample sum for this particular functional group - level_1 KEGG hierarchy [level 1] if KO defined in first column - level_2 KEGG hierarchy [level 2] if KO defined in first column - level_3 KEGG hierarchy [level 3] if KO defined in first column - ==================== =================================================================== + ==================== =================================================================== + Header ID Definition + ==================== =================================================================== + _ec either HAMAP,dbCAN or TIGRFAMs EC + _gene either HAMAP,dbCAN or TIGRFAMs gene + _product either HAMAP,dbCAN or TIGRFAMs product + _enzyme_class dbcan only enzyme class descriptor + _enzyme_class_subfamily dbcan only enzyme class subfamily descriptor + samples names non-normalized, per sample sum for this particular functional group + ==================== =================================================================== Combined ```````` Per taxonomy+function assignments in tables named -**summaries/combined/_.txt** contain: +**summaries/combined/_.txt** contain: .. 
table:: :name: combineddeftable - ==================== ==================================================================== - Header ID Definition - ==================== ==================================================================== - either KO, EC, or product; counts are summed using + - taxonomy_ taxonomic level; counts are summed using + - sample names non-normalized, per sample sum for this particular functional group - level_1 KEGG hierarchy [level 1] if KO defined in first column - level_2 KEGG hierarchy [level 2] if KO defined in first column - level_3 KEGG hierarchy [level 3] if KO defined in first column - ==================== ==================================================================== + ==================== ==================================================================== + Header ID Definition + ==================== ==================================================================== + _ec either HAMAP,dbCAN or TIGRFAMs EC + _gene either HAMAP,dbCAN or TIGRFAMs gene + _product either HAMAP,dbCAN or TIGRFAMs product + _enzyme_class dbcan only enzyme class descriptor + _enzyme_class_subfamily dbcan only enzyme class subfamily descriptor + taxonomy_ taxonomic level; counts are summed using + + sample names non-normalized, per sample sum for this particular functional group + ==================== ==================================================================== Downloads --------- @@ -530,7 +655,7 @@ def main( p.add_argument("--r1-quality-files") p.add_argument("--html") p.add_argument("conda_env") - p.add_argument("function_table") + p.add_argument("taxonomy_table") p.add_argument("zipped_file") # p.add_argument("taxonomy_table") # p.add_argument("taxonomy_function_table") @@ -545,7 +670,7 @@ def main( args.r1_quality_files, args.html, args.conda_env, - args.function_table, + args.taxonomy_table, args.zipped_file, # args.taxonomy_table, # args.taxonomy_function_table, diff --git a/scripts/make_classification_table.py b/scripts/make_classification_table.py old mode 100644 new mode 100755 index 6214129..ac99a76 --- a/scripts/make_classification_table.py +++ b/scripts/make_classification_table.py @@ -48,11 +48,11 @@ def main(kaiju, hamap, dbcan, tigrfams, diamond, output): sequence_counts = parse_diamond_outputs(diamond) # ECs may be a comma delimited list - # sequence_id -> [ec, gene, product, hmm_id, score, evalue] + # sequence_id -> [ec, gene, product, hmm_id, evalue, score] hamap_hits = parse_hmm_hits(hamap) - # sequence_id -> [ec, enzyme class, enzyme class subfamily, hmm_id, score, evalue] + # sequence_id -> [ec, enzyme class, enzyme class subfamily, hmm_id, evalue, score] dbcan_hits = parse_hmm_hits(dbcan) - # sequence_id -> [ec, gene, product, hmm_id, score, evalue] + # sequence_id -> [ec, gene, product, hmm_id, evalue, score] tigrfams_hits = parse_hmm_hits(tigrfams) output_header = [ @@ -108,7 +108,7 @@ def main(kaiju, hamap, dbcan, tigrfams, diamond, output): tigrfams_ec, tigrfams_gene, tigrfams_product, - tigerfams_score, + tigrfams_score, tigrfams_evalue, hamap_ec, hamap_gene, diff --git a/scripts/summarize_classifications.py b/scripts/summarize_classifications.py index 9e2afa0..8853d67 100755 --- a/scripts/summarize_classifications.py +++ b/scripts/summarize_classifications.py @@ -57,71 +57,89 @@ # logging.debug(f"Final table length: {len(df)}") # return df -def single_output(df,tax_level,samples,min_evlaue,min_score,min_len): +def single_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,output): split_idx = TAX_LEVELS[tax_level] - 
group_one=["tigrfams","hamap","dbcan","kaiju"] - for item in group_one: - hmm_cols = df.filter(like=item).columns.values.tolist() + # group_one=["tigrfams","hamap","dbcan","kaiju"] + # for item in group_on + #test_df.head() + if "kaiju_taxonomy" in group_on: + hmm_cols = df.filter(like="kaiju").columns.values.tolist() + test_df = df[hmm_cols+samples].copy() + test_df = test_df[(df["kaiju_length"] > min_len)] + test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(str(x).split("; ", split_idx)[0:split_idx])) + test_df = test_df.drop(columns=["kaiju_length"]).groupby(["kaiju_taxonomy"]).sum(axis=1).reset_index() + test_df = test_df.rename( + columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} + ) + else: + hmm_cols = df.filter(like=group_on[0].lower()).columns.values.tolist() #TODO might not need test_df = df[hmm_cols+samples].copy() - if "kaiju" in item: - test_df = test_df[(df["kaiju_length"] > min_len)] - test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) - test_df = test_df.drop(columns=["kaiju_length"]).groupby(["kaiju_taxonomy"]).sum(axis=1).reset_index() - test_df = test_df.rename( - columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} - ) - else: - metric_cols = test_df.filter(items=[item+"_score",item+"_evalue"]).columns.values.tolist() - test_df = test_df[(test_df[item+"_score"] > min_score) & (test_df[item+"_evalue"] < min_evalue)] - test_df = test_df.drop(columns=metric_cols).groupby(hmm_cols[0:3]).sum(axis=1).reset_index().head() - test_df.to_csv(item+output, sep="\t", index=False) + metric_cols = test_df.filter(items=[group_on[0]+"_score",group_on[0]+"_evalue"]).columns.values.tolist() + test_df = test_df[(test_df[group_on[0].lower()+"_score"] > min_score) & (test_df[group_on[0].lower()+"_evalue"] < min_evalue)] + test_df = test_df.drop(columns=metric_cols).groupby(hmm_cols[0:3]).sum(axis=1).reset_index() + test_df.to_csv(output, sep="\t", index=False) -def combined_output(df,tax_level,samples,min_evalue,min_score,min_len): +def combined_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,output): split_idx = TAX_LEVELS[tax_level] - group_one=["tigrfams","hamap","dbcan"] + # group_one=["tigrfams","hamap","dbcan"] kaiju_cols = df.filter(like="kaiju").columns.values.tolist() - for item in group_one: - hmm_cols = df.filter(like=item).columns.values.tolist() - #TODO might not need - test_df = df[kaiju_cols+hmm_cols+samples].copy() - test_df = test_df[(df["kaiju_length"] > min_len) & (test_df[item+"_score"] > min_score) & (test_df[item+"_evalue"] < min_evalue)] - test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) - metric_cols = test_df.filter(items=[item+"_score",item+"_evalue"]).columns.values.tolist() - metric_cols.append("kaiju_length") - test_df = test_df.drop(columns=metric_cols).groupby(["kaiju_taxonomy"]+hmm_cols[0:3]).sum(axis=1).reset_index() - test_df = test_df.rename( - columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} - ) - test_df.to_csv("taxonomy_"+item, sep="\t", index=False) + # for item in group_one: + hmm_cols = df.filter(like=group_on[0].lower()).columns.values.tolist() + print(samples) + print(group_on[0]) + #TODO might not need + test_df = df[kaiju_cols+hmm_cols+samples].copy() + test_df = test_df[(df["kaiju_length"] > min_len) & (test_df[group_on[0].lower()+"_score"] > min_score) & (test_df[group_on[0].lower()+"_evalue"] < min_evalue)] + test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda 
x: "; ".join(str(x).split("; ", split_idx)[0:split_idx])) + metric_cols = test_df.filter(items=[group_on[0].lower()+"_score",group_on[0].lower()+"_evalue"]).columns.values.tolist() + print(metric_cols) + metric_cols.append("kaiju_length") + test_df = test_df.drop(columns=metric_cols).groupby(["kaiju_taxonomy"]+hmm_cols[0:3]).sum(axis=1).reset_index() + test_df = test_df.rename( + columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} + ) + test_df.to_csv(output, sep="\t", index=False) -def main(output, table, tax_level, min_evalue, min_score, min_len): - df = pd.read_table(table) +def main(output, table, tax_level, min_evalue, min_score, min_len,group_on): + df = pd.read_table(table,low_memory=False) + col_names = [ + "seq","kaiju_length","kaiju_taxonomy", "tigrfams_ec", "tigrfams_gene", "tigrfams_product", + "tigrfams_score", "tigrfams_evalue", "hamap_ec", "hamap_gene", + "hamap_product", "hamap_score", "hamap_evalue", "dbcan_ec", + "dbcan_enzyme_class", "dbcan_enzyme_class_subfamily","dbcan_score", "dbcan_evalue" + ] # remove the rows w/ no information - df = df.query("kaiju_length != 0") - samples = df.filter(regex="[0-9]+").columns.values.tolist() - split_idx = TAX_LEVELS[tax_level] - group_one=["tigrfams","hamap","dbcan","kaiju"] - for item in group_one: - print(item) - use_these = df.filter(like=item).columns.values.tolist() - print(use_these) - #TODO might not need - test_df = df[use_these+samples].copy() - if "kaiju" in item: - test_df = test_df[(df["kaiju_length"] > min_len)] - test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) - test_df = test_df.drop(columns=["kaiju_length"]).groupby(["kaiju_taxonomy"]).sum(axis=1).reset_index() - test_df = test_df.rename( - columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} - ) - else: - metric_cols = test_df.filter(items=[item+"_score",item+"_evalue"]).columns.values.tolist() - test_df = test_df[(test_df[item+"_score"] > min_score) & (test_df[item+"_evalue"] < min_evalue)] - test_df = test_df.drop(columns=metric_cols).groupby(use_these[0:3]).sum(axis=1).reset_index().head() - test_df.to_csv(item+output, sep="\t", index=False) + # df = df.dropna( + # subset= col_names, + # thresh=2) + samples = [c for c in df.columns.values.tolist() if c not in col_names] + if len(group_on) > 1: + combined_output(df,tax_level,samples,min_evalue, min_score, min_len, group_on,output) + else: + single_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,output) + # split_idx = TAX_LEVELS[tax_level] + # group_one=["tigrfams","hamap","dbcan","kaiju"] + # for item in group_one: + # print(item) + # use_these = df.filter(like=item).columns.values.tolist() + # print(use_these) + # #TODO might not need + # test_df = df[use_these+samples].copy() + # if "kaiju" in item: + # test_df = test_df[(df["kaiju_length"] > min_len)] + # test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) + # test_df = test_df.drop(columns=["kaiju_length"]).groupby(["kaiju_taxonomy"]).sum(axis=1).reset_index() + # test_df = test_df.rename( + # columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} + # ) + # else: + # metric_cols = test_df.filter(items=[item+"_score",item+"_evalue"]).columns.values.tolist() + # test_df = test_df[(test_df[item+"_score"] > min_score) & (test_df[item+"_evalue"] < min_evalue)] + # test_df = test_df.drop(columns=metric_cols).groupby(use_these[0:3]).sum(axis=1).reset_index().head() + # test_df.to_csv(item+output, sep="\t", index=False) # 
tax_split_level = TAX_LEVELS[tax_level]
 #     sample_df = None
 #     logging.debug(f"Preparing to parse {len(tables)} tables")
@@ -169,7 +187,7 @@ def main(output, table, tax_level, min_evalue, min_score, min_len):
     )
     parser.add_argument(
         "--min-evalue",
-        type=int,
+        type=float,
         default=0.001,
         help="lowest evalue to retain per sequence per sample",
     )
@@ -181,17 +199,17 @@
     )
     parser.add_argument(
         "--min-len",
-        type=int,
+        type=float,
         default=0.001,
         help="minimum kaiju alignment length to retain",
     )
-    # parser.add_argument(
-    #     "--group-on",
-    #     # choices=["kaiju_classification", "ko", "ec", "product"],
-    #     nargs="+",
-    #     default="kaiju_taxonomy",
-    #     help="cluster tables on taxonomy and/or function; headers must match in tables",
-    # )
+    parser.add_argument(
+        "--group-on",
+        # choices=["kaiju_classification", "ko", "ec", "product"],
+        nargs="+",
+        default=["kaiju_taxonomy"],
+        help="cluster tables on taxonomy and/or function; headers must match in tables",
+    )
     parser.add_argument(
         "--verbose", action="store_true", help="increase output verbosity"
    )
@@ -205,5 +223,5 @@
         args.min_evalue,
         args.min_score,
         args.min_len,
-        # args.group_on,
+        args.group_on,
     )
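
The build_tax_table and build_hmm_and_tax_table shell blocks in the Snakefile above are the
callers of the reworked --group-on interface in scripts/summarize_classifications.py. A minimal
sketch of the equivalent stand-alone invocations follows; the threshold values are illustrative
assumptions, not values taken from this patch or its config:

    # taxonomy-only summary at the order level (one --group-on value, handled by single_output)
    python scripts/summarize_classifications.py \
        --group-on kaiju_taxonomy \
        --min-evalue 0.001 --min-score 40 --min-len 25 \
        --tax-level order \
        summaries/taxonomy/order.txt gene_catalog/annotations_cleaned.txt

    # combined HMM + taxonomy summary (two --group-on values, handled by combined_output)
    python scripts/summarize_classifications.py \
        --group-on TIGRFAMs kaiju_taxonomy \
        --min-evalue 0.001 --min-score 40 --min-len 25 \
        --tax-level order \
        summaries/combined/TIGRFAMs_order.txt gene_catalog/annotations_cleaned.txt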