72 changes: 45 additions & 27 deletions Snakefile
@@ -136,12 +136,12 @@ def get_summaries():
hmm = ["TIGRFAMs","HAMAP","dbCAN"]
taxonomy = ["phylum", "class", "order"]
# code to generate the possible files
file_paths = expand("summaries/combined/{hmm}_{taxonomy}.txt",
hmm=hmm, taxonomy=taxonomy)
file_paths = expand("summaries/combined/{hmm}_{tax_classification}.txt",
hmm=hmm, tax_classification=taxonomy)
file_paths.extend(expand("summaries/hmms_summary/{hmm}_summary.txt",
hmm=hmm))
file_paths.extend(expand("summaries/taxonomy/{taxonomy}.txt",
taxonomy=taxonomy))
file_paths.extend(expand("summaries/taxonomy/{tax_classification}.txt",
tax_classification=taxonomy))
return file_paths
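# a minimal sketch of what expand() yields above (standard Snakemake
# cartesian-product behavior, values shortened for illustration):
#   expand("summaries/combined/{hmm}_{tax_classification}.txt",
#          hmm=["TIGRFAMs"], tax_classification=["phylum", "class"])
#   -> ["summaries/combined/TIGRFAMs_phylum.txt",
#       "summaries/combined/TIGRFAMs_class.txt"]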


@@ -394,7 +394,7 @@ rule aggregate_all_genes:
with open(f) as fh:
for name, seq, _ in readfx(fh):
# per sequence, choose first only
if name.endswith("_2"):
if not name.endswith("_1"):
continue
print(">%d" % name_index, seq.replace("*", ""), sep="\n", file=out_faa)
name_index += 1
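# sketch of the filter above, assuming gene names end in an ORF index
# (hypothetical ids): "k141_203_1" is kept, "k141_203_2" is skipped, and each
# kept record is renumbered ">0", ">1", ... with stop-codon "*" stripped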
@@ -481,7 +481,7 @@ rule split_fasta:
input:
fasta = "gene_catalog/clustered_genes.faa"
output:
temp((dynamic("gene_catalog/tmp/clustered_genes_{chunk}.faa")))
dynamic("gene_catalog/tmp/clustered_genes_{chunk}.faa")
params:
# consider a smaller chunk size
chunk_size = config.get("chunk_size", 1000000)
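# note: dynamic() is deprecated in favor of checkpoints in newer Snakemake
# releases; this assumes the version pinned for this workflow still supports
# it. without temp(), the per-chunk files now persist after the run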
@@ -621,9 +621,23 @@ rule combine_sample_output:
"""


rule remove_empty_annotations:
input:
annotations = "gene_catalog/annotations.txt"
output:
cleaned = "gene_catalog/annotations_cleaned.txt"
run:
with open(input.annotations) as in_fh, open(output.cleaned, "w") as out_fh:
header = next(in_fh)
print(header.strip("\n"), file=out_fh)
for line in in_fh:
toks = line.strip("\r\n").split("\t")
if not all(s == "" for s in toks[2:12]):
print(line.strip("\n"), file=out_fh)
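# sketch of the emptiness test with a hypothetical row: for
#   gene_1<TAB>900<TAB><TAB>...<TAB>   (annotation fields 2..11 all empty)
# all(s == "" for s in toks[2:12]) is True and the line is dropped; any single
# non-empty HMM or taxonomy field keeps it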

rule build_hmms_table:
input:
"gene_catalog/annotations.txt"
"gene_catalog/annotations_cleaned.txt"
output:
"summaries/hmms_summary/{hmm}_summary.txt"
params:
@@ -645,7 +659,7 @@ rule build_hmms_table:

rule build_tax_table:
input:
"gene_catalog/annotations.txt"
"gene_catalog/annotations_cleaned.txt"
output:
"summaries/taxonomy/{tax_classification}.txt"
params:
@@ -659,15 +673,15 @@ rule build_tax_table:
shell:
"""
python scripts/summarize_classifications.py \
--group-on kaiju_classification --min-evalue {params.min_evalue} \
--group-on kaiju_taxonomy --min-evalue {params.min_evalue} \
--min-score {params.min_score} --min-len {params.min_len} \
--tax-level {wildcards.tax_classification} {output} {input}
"""


rule build_hmm_and_tax_table:
input:
"gene_catalog/annotations.txt"
"gene_catalog/annotations_cleaned.txt"
output:
"summaries/combined/{hmm}_{tax_classification}.txt"
params:
@@ -679,12 +693,12 @@ rule build_hmm_and_tax_table:
shell:
"""
python scripts/summarize_classifications.py \
--group-on {wildcards.hmm} kaiju_classification \
--group-on {wildcards.hmm} kaiju_taxonomy \
--tax-level {wildcards.tax_classification} --min-evalue {params.min_evalue} \
--min-score {params.min_score}--min-len {params.min_len} {output} {input}
--min-score {params.min_score} --min-len {params.min_len} {output} {input}
"""

## TODO fix this rule

rule build_krona_ec_input:
input:
ec_file = "summaries/hmms_summary/TIGRFAMs_summary.txt",
@@ -731,30 +745,34 @@ rule build_krona_plots:
"""


# rule zip_attachments:
# input:
# function = "summaries/function/ec.txt",
# taxonomy = "summaries/taxonomy/order.txt",
# krona_tax = "krona_plots/tax.krona.html",
# krona_ec = "krona_plots/ec.krona.html"
# output:
# temp("perseq_downloads.zip")
# shell:
# """
# zip {output} {input.function} {input.taxonomy} {input.combined}
# """
rule zip_attachments:
input:
function = "summaries/hmms_summary/TIGRFAMs_summary.txt",
taxonomy = "summaries/taxonomy/order.txt",
combined = "summaries/combined/TIGRFAMs_order.txt",
krona_tax = "krona_plots/tax.krona.html",
krona_ec = "krona_plots/ec.krona.html"
output:
"perseq_downloads.tar.gz"
conda:
CONDAENV
shell:
"""
tar -czf {output} {input.function} {input.taxonomy} {input.combined} \
{input.krona_tax} {input.krona_ec}
"""


rule build_report:
input:
annotations = "gene_catalog/annotations.txt",
annotations = "gene_catalog/annotations_cleaned.txt",
ee_stats = expand("logs/{sample}_{idx}_eestats.txt", sample=config["samples"].keys(), idx=["R1", "R2"]),
clean_length_logs = expand("logs/{sample}_03_clean_readlengths.txt", sample=config["samples"].keys()),
unique_length_logs = expand("logs/{sample}_02_unique_readlengths.txt", sample=config["samples"].keys()),
clean_logs = expand("logs/{sample}_decontamination.log", sample=config["samples"].keys()),
merge_logs = expand("logs/{sample}_merge_sequences.log", sample=config["samples"].keys()),
taxonomy = "summaries/taxonomy/order.txt",
zipped_files = "perseq_downloads.zip"
zipped_files = "perseq_downloads.tar.gz"
output:
"summary.html"
conda:
2 changes: 1 addition & 1 deletion envs/environment.yml
@@ -19,5 +19,5 @@ dependencies:
- seqtk=1.3
- pip
- pip:
- relatively
- relatively>=1.0.5
- plotly>=3.0
127 changes: 80 additions & 47 deletions scripts/build_krona.py
@@ -17,11 +17,10 @@ def build_tax(tax_file, output):
# read in tax file to order level from build_tax_table rule
test_tax = pd.read_table(tax_file)
col_names = list(test_tax)[1:]
taxonomies = ["Kingdom", "Phylum", "Class", "Order"]
# split taxonomy column into hierarchy
test_tax[
["Kingdom", "Phylum", "Class", "Order"]
] = test_tax.taxonomy_order.str.split("; ", expand=True)
] = test_tax.taxonomy_order.str.split("; ", n=4, expand=True)
# generate individual files per sample
for item in col_names:
df_name = f"test_tax_{item}"
@@ -85,72 +84,106 @@ def build_ec_dict(ec_filename, dat_filename):
return d


# def parse_ec_file(ec_file_from_summaries):
# """
# Split sequences that aligned to multiple ec numbers into last common ec number
# shared amongst matches
# """
# with gzopen(ec_file_from_summaries) as ec_file_sum:
#
# next(ec_file_sum)
# new_ec_dict = {"ec": []}
# # split ec numbers that mapped to multiple
# for item in ec_file_sum:
# toks = item.partition("\t")[0]
# ec = toks.split(";")
# # print(ec)
# first_item = ec[0][:5]
# if len(ec) == 1:
# new_ec_dict["ec"].append(ec[0])
# else:
# n = 0
# for item in ec[1:]:
# if item[:5] == first_item:
# new_ec_dict["ec"].append(item[:5] + ".-")
# # print('same',item,first_item)
# elif item[:3] == first_item[:3]:
# new_ec_dict["ec"].append(item[:3] + ".-.-")
# # print('same to high level',item,first_item)
# elif item[:1] == first_item[:1]:
# new_ec_dict["ec"].append(item[:1] + ".-.-.-")
# return new_ec_dict

def parse_ec_file(ec_file_from_summaries):
"""
Split sequences that aligned to multiple ec numbers into last common ec number
shared amongst matches
For EC numbers that aligned to multiple ECs in TIGRFAMs, only the first
entry is retained for Krona plots.
"""
with gzopen(ec_file_from_summaries) as ec_file_sum:

next(ec_file_sum)
new_ec_dict = {"ec": []}
# split ec numbers that mapped to multiple
for item in ec_file_sum:
toks = item.partition("\t")[0]
ec = toks.split(";")
# print(ec)
first_item = ec[0][:5]
if len(ec) == 1:
new_ec_dict["ec"].append(ec[0])
else:
n = 0
for item in ec[1:]:
if item[:5] == first_item:
new_ec_dict["ec"].append(item[:5] + ".-")
# print('same',item,first_item)
elif item[:3] == first_item[:3]:
new_ec_dict["ec"].append(item[:3] + ".-.-")
# print('same to high level',item,first_item)
elif item[:1] == first_item[:1]:
new_ec_dict["ec"].append(item[:1] + ".-.-.-")
return new_ec_dict
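# worked example of the prefix comparison above (hypothetical ECs, assuming
# single-digit fields so the [:5]/[:3]/[:1] character slices line up):
#   "1.2.3.4"          -> "1.2.3.4"  (single match, kept verbatim)
#   "1.2.3.4;1.2.3.9"  -> "1.2.3.-"  (first five characters agree)
#   "1.2.3.4;1.2.9.9"  -> "1.2.-.-"  (agreement only through [:3])
#   "1.2.3.4;1.9.9.9"  -> "1.-.-.-"  (agreement only on the first digit)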


def build_ec_output(new_ec_dict, ec_file_from_summaries, parsed_dict, output):
ec_df = pd.read_table(ec_file_from_summaries)
samples = ec_df.filter(regex="[0-9]+").columns.values.tolist()
ec_df = ec_df[["tigrfams_ec"]+samples]
ec_df = ec_df.groupby(["tigrfams_ec"]).sum().reset_index()
ec_df[["tigrfams_ec"]] = ec_df.tigrfams_ec.str.split(",", n=1).str[0]
return ec_df
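# e.g. a hypothetical multi-EC annotation "1.2.3.4,1.5.6.7" collapses to
# "1.2.3.4" here, matching the docstring: only the first entry is kept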


# def build_ec_output(new_ec_dict, ec_file_from_summaries, parsed_dict, output):
# """
# Splits the ec number counts and hierarchy by sample
# """
# #ec_replace = pd.DataFrame.from_dict(new_ec_dict)
# ec_table = pd.read_table(ec_file_from_summaries)
# col_names = list(ec_table)[1:]
# #ec_table["ec"] = ec_replace["ec"]
# ec_hier_dict = pd.DataFrame.from_dict(parsed_dict, orient="index").reset_index()
# ec_hier_dict = ec_hier_dict.rename(columns={"index": "ec"})
# # print(ec_hier_dict.head(5))
# grouped_ec_tbl = ec_table.merge(ec_hier_dict, on="ec", how="inner")
# # print(grouped_ec_tbl.head(5))
# grouped_ec_tbl = grouped_ec_tbl.rename(
# columns={0: "level_1", 1: "level_2", 2: "level_3", 3: "level_4"}
# )
# # split the samples into separate files
# for item in col_names:
# df_name = f"test_tax_{item}"
# df_name = grouped_ec_tbl[
# [f"{item}", "level_1", "level_2", "level_3", "level_4"]
# ]
#
# df_name.to_csv(
# output + "/" + f"{item}" + "_ec.txt", sep="\t", index=False, header=False
# )
def build_ec_output(ec_df, parsed_dict, output):
"""
Splits the ec number counts and hierarchy by sample
"""
ec_replace = pd.DataFrame.from_dict(new_ec_dict)
ec_table = pd.read_table(ec_file_from_summaries)
col_names = list(ec_table)[1:]
ec_table["ec"] = ec_replace["ec"]
ec_table = ec_df
samples = list(ec_table)[1:]
ec_hier_dict = pd.DataFrame.from_dict(parsed_dict, orient="index").reset_index()
ec_hier_dict = ec_hier_dict.rename(columns={"index": "ec"})
# print(ec_hier_dict.head(5))
grouped_ec_tbl = ec_table.merge(ec_hier_dict, on="ec", how="inner")
# print(grouped_ec_tbl.head(5))
ec_hier_dict = ec_hier_dict.rename(columns={"index": "tigrfams_ec"})
grouped_ec_tbl = ec_table.merge(ec_hier_dict, on="tigrfams_ec", how="inner")
grouped_ec_tbl = grouped_ec_tbl.rename(
columns={0: "level_1", 1: "level_2", 2: "level_3", 3: "level_4"}
)
# split the samples into separate files
for item in col_names:
df_name = f"test_tax_{item}"
for sample in samples:
#df_name = f"test_tax_{sample}"
df_name = grouped_ec_tbl[
[f"{item}", "level_1", "level_2", "level_3", "level_4"]
[f"{sample}", "level_1", "level_2", "level_3", "level_4"]
]

print(df_name.head())
df_name.to_csv(
output + "/" + f"{item}" + "_ec.txt", sep="\t", index=False, header=False
output + "/" + f"{sample}" + "_ec.txt", sep="\t", index=False, header=False
)
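# each per-sample file written above is a headerless <count><TAB>level_1..level_4
# table, the layout Krona's ktImportText expects (presumably what the
# build_krona_plots rule feeds it)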


def main(tax_file, output, ec_file, ec_file_from_summaries, dat_file):
if tax_file is not None:
build_tax(tax_file, output)
else:
new_ec_dict = parse_ec_file(ec_file_from_summaries)
ec_df = parse_ec_file(ec_file_from_summaries)
parsed_dict = build_ec_dict(ec_file, dat_file)
build_ec_output(new_ec_dict, ec_file_from_summaries, parsed_dict, output)
build_ec_output(ec_df, parsed_dict, output)


if __name__ == "__main__":
@@ -167,7 +200,7 @@ def main(tax_file, output, ec_file, ec_file_from_summaries, dat_file):
p.add_argument(
"--ec-file-from-summaries",
type=str,
help="file that contains the ec numbers from the diamond alignment",
help="file that contains the ec numbers from TIGRFAMs summary",
)
p.add_argument(
"--dat-file",