From 4945d8c9b56dcc3aedcc027d9e3647f1e836cb02 Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 27 Nov 2018 12:49:54 -0800 Subject: [PATCH 01/14] fixed ec krona building --- Snakefile | 32 ++++++----- scripts/build_krona.py | 127 ++++++++++++++++++++++++++--------------- 2 files changed, 97 insertions(+), 62 deletions(-) diff --git a/Snakefile b/Snakefile index 463699e..cedabe9 100644 --- a/Snakefile +++ b/Snakefile @@ -394,7 +394,7 @@ rule aggregate_all_genes: with open(f) as fh: for name, seq, _ in readfx(fh): # per sequence, choose first only - if name.endswith("_2"): + if not name.endswith("_1"): continue print(">%d" % name_index, seq.replace("*", ""), sep="\n", file=out_faa) name_index += 1 @@ -481,7 +481,7 @@ rule split_fasta: input: fasta = "gene_catalog/clustered_genes.faa" output: - temp((dynamic("gene_catalog/tmp/clustered_genes_{chunk}.faa"))) + dynamic("gene_catalog/tmp/clustered_genes_{chunk}.faa") params: # consider a smaller chunk size chunk_size = config.get("chunk_size", 1000000) @@ -684,7 +684,7 @@ rule build_hmm_and_tax_table: --min-score {params.min_score}--min-len {params.min_len} {output} {input} """ -## TODO fix this rule + rule build_krona_ec_input: input: ec_file = "summaries/hmms_summary/TIGRFAMs_summary.txt", @@ -731,18 +731,20 @@ rule build_krona_plots: """ -# rule zip_attachments: -# input: -# function = "summaries/function/ec.txt", -# taxonomy = "summaries/taxonomy/order.txt", -# krona_tax = "krona_plots/tax.krona.html", -# krona_ec = "krona_plots/ec.krona.html" -# output: -# temp("perseq_downloads.zip") -# shell: -# """ -# zip {output} {input.function} {input.taxonomy} {input.combined} -# """ +rule zip_attachments: + input: + function = "summaries/hmms_summary/TIGRFAMs_summary.txt", + taxonomy = "summaries/taxonomy/order.txt", + combined = "summaries/combined/TIGRFAMs_order.txt", + krona_tax = "krona_plots/tax.krona.html", + krona_ec = "krona_plots/ec.krona.html" + output: + temp("perseq_downloads.zip") + shell: + """ + zip {output} {input.function} {input.taxonomy} {input.combined} \ + {input.krona_tax} {input.krona_ec} + """ rule build_report: diff --git a/scripts/build_krona.py b/scripts/build_krona.py index 6222eed..0fc5642 100755 --- a/scripts/build_krona.py +++ b/scripts/build_krona.py @@ -17,11 +17,10 @@ def build_tax(tax_file, output): # read in tax file to order level from build_tax_table rule test_tax = pd.read_table(tax_file) col_names = list(test_tax)[1:] - taxonomies = ["Kingdom", "Phylum", "Class", "Order"] # split taxonomy column into hierarchy test_tax[ ["Kingdom", "Phylum", "Class", "Order"] - ] = test_tax.taxonomy_order.str.split("; ", expand=True) + ] = test_tax.taxonomy_order.str.split("; ",n=4, expand=True) # generate individual files per sample for item in col_names: df_name = f"test_tax_{item}" @@ -85,62 +84,96 @@ def build_ec_dict(ec_filename, dat_filename): return d +# def parse_ec_file(ec_file_from_summaries): +# """ +# Split sequences that aligned to multiple ec numbers into last common ec number +# shared amongst matches +# """ +# with gzopen(ec_file_from_summaries) as ec_file_sum: +# +# next(ec_file_sum) +# new_ec_dict = {"ec": []} +# # split ec numbers that mapped to multiple +# for item in ec_file_sum: +# toks = item.partition("\t")[0] +# ec = toks.split(";") +# # print(ec) +# first_item = ec[0][:5] +# if len(ec) == 1: +# new_ec_dict["ec"].append(ec[0]) +# else: +# n = 0 +# for item in ec[1:]: +# if item[:5] == first_item: +# new_ec_dict["ec"].append(item[:5] + ".-") +# # print('same',item,first_item) +# elif item[:3] == 
first_item[:3]:
+#                        new_ec_dict["ec"].append(item[:3] + ".-.-")
+#                        # print('same to high level',item,first_item)
+#                    elif item[:1] == first_item[:1]:
+#                        new_ec_dict["ec"].append(item[:1] + ".-.-.-")
+#    return new_ec_dict
+
 def parse_ec_file(ec_file_from_summaries):
     """
-    Split sequences that aligned to multiple ec numbers into last common ec number
-    shared amongst matches
+    For EC numbers that aligned to multiple ECs in TIGRFAMs, only the first
+    entry is retained for the Krona plots.
     """
-    with gzopen(ec_file_from_summaries) as ec_file_sum:
-
-        next(ec_file_sum)
-        new_ec_dict = {"ec": []}
-        # split ec numbers that mapped to multiple
-        for item in ec_file_sum:
-            toks = item.partition("\t")[0]
-            ec = toks.split(";")
-            # print(ec)
-            first_item = ec[0][:5]
-            if len(ec) == 1:
-                new_ec_dict["ec"].append(ec[0])
-            else:
-                n = 0
-                for item in ec[1:]:
-                    if item[:5] == first_item:
-                        new_ec_dict["ec"].append(item[:5] + ".-")
-                        # print('same',item,first_item)
-                    elif item[:3] == first_item[:3]:
-                        new_ec_dict["ec"].append(item[:3] + ".-.-")
-                        # print('same to high level',item,first_item)
-                    elif item[:1] == first_item[:1]:
-                        new_ec_dict["ec"].append(item[:1] + ".-.-.-")
-    return new_ec_dict
-
-
-def build_ec_output(new_ec_dict, ec_file_from_summaries, parsed_dict, output):
+    ec_df = pd.read_table(ec_file_from_summaries)
+    samples = ec_df.filter(regex="[0-9]+").columns.values.tolist()
+    ec_df = ec_df[["tigrfams_ec"] + samples]
+    ec_df = ec_df.groupby(["tigrfams_ec"]).sum(axis=1).reset_index()
+    ec_df[["tigrfams_ec"]] = ec_df.tigrfams_ec.str.split(",", n=1).str[0]
+    return ec_df
+
+
+# def build_ec_output(new_ec_dict, ec_file_from_summaries, parsed_dict, output):
+#     """
+#     Splits the ec number counts and hierarchy by sample
+#     """
+#     #ec_replace = pd.DataFrame.from_dict(new_ec_dict)
+#     ec_table = pd.read_table(ec_file_from_summaries)
+#     col_names = list(ec_table)[1:]
+#     #ec_table["ec"] = ec_replace["ec"]
+#     ec_hier_dict = pd.DataFrame.from_dict(parsed_dict, orient="index").reset_index()
+#     ec_hier_dict = ec_hier_dict.rename(columns={"index": "ec"})
+#     # print(ec_hier_dict.head(5))
+#     grouped_ec_tbl = ec_table.merge(ec_hier_dict, on="ec", how="inner")
+#     # print(grouped_ec_tbl.head(5))
+#     grouped_ec_tbl = grouped_ec_tbl.rename(
+#         columns={0: "level_1", 1: "level_2", 2: "level_3", 3: "level_4"}
+#     )
+#     # split the samples into separate files
+#     for item in col_names:
+#         df_name = f"test_tax_{item}"
+#         df_name = grouped_ec_tbl[
+#             [f"{item}", "level_1", "level_2", "level_3", "level_4"]
+#         ]
+#
+#         df_name.to_csv(
+#             output + "/" + f"{item}" + "_ec.txt", sep="\t", index=False, header=False
+#         )
+def build_ec_output(ec_df, parsed_dict, output):
     """
     Splits the ec number counts and hierarchy by sample
     """
-    ec_replace = pd.DataFrame.from_dict(new_ec_dict)
-    ec_table = pd.read_table(ec_file_from_summaries)
-    col_names = list(ec_table)[1:]
-    ec_table["ec"] = ec_replace["ec"]
+    ec_table = ec_df
+    samples = list(ec_table)[1:]
     ec_hier_dict = pd.DataFrame.from_dict(parsed_dict, orient="index").reset_index()
-    ec_hier_dict = ec_hier_dict.rename(columns={"index": "ec"})
-    # print(ec_hier_dict.head(5))
-    grouped_ec_tbl = ec_table.merge(ec_hier_dict, on="ec", how="inner")
-    # print(grouped_ec_tbl.head(5))
+    ec_hier_dict = ec_hier_dict.rename(columns={"index": "tigrfams_ec"})
+    grouped_ec_tbl = ec_table.merge(ec_hier_dict, on="tigrfams_ec", how="inner")
     grouped_ec_tbl = grouped_ec_tbl.rename(
         columns={0: "level_1", 1: "level_2", 2: "level_3", 3: "level_4"}
     )
     # split the samples into separate files
-    for item
in col_names: - df_name = f"test_tax_{item}" + for sample in samples: + #df_name = f"test_tax_{sample}" df_name = grouped_ec_tbl[ - [f"{item}", "level_1", "level_2", "level_3", "level_4"] + [f"{sample}", "level_1", "level_2", "level_3", "level_4"] ] - + print(df_name.head()) df_name.to_csv( - output + "/" + f"{item}" + "_ec.txt", sep="\t", index=False, header=False + output + "/" + f"{sample}" + "_ec.txt", sep="\t", index=False, header=False ) @@ -148,9 +181,9 @@ def main(tax_file, output, ec_file, ec_file_from_summaries, dat_file): if tax_file is not None: build_tax(tax_file, output) else: - new_ec_dict = parse_ec_file(ec_file_from_summaries) + ec_df = parse_ec_file(ec_file_from_summaries) parsed_dict = build_ec_dict(ec_file, dat_file) - build_ec_output(new_ec_dict, ec_file_from_summaries, parsed_dict, output) + build_ec_output(ec_df, parsed_dict, output) if __name__ == "__main__": @@ -167,7 +200,7 @@ def main(tax_file, output, ec_file, ec_file_from_summaries, dat_file): p.add_argument( "--ec-file-from-summaries", type=str, - help="file that contains the ec numbers from the diamond alignment", + help="file that contains the ec numbers from TIGRFAMs summary", ) p.add_argument( "--dat-file", From bc175b739acbde095ab041e043fbbe7aa9ea60ce Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 27 Nov 2018 12:50:44 -0800 Subject: [PATCH 02/14] altered summary scripts --- scripts/make_classification_table.py | 8 +- scripts/summarize_classifications.py | 137 +++++++++++++++------------ 2 files changed, 78 insertions(+), 67 deletions(-) mode change 100644 => 100755 scripts/make_classification_table.py diff --git a/scripts/make_classification_table.py b/scripts/make_classification_table.py old mode 100644 new mode 100755 index 6214129..ac99a76 --- a/scripts/make_classification_table.py +++ b/scripts/make_classification_table.py @@ -48,11 +48,11 @@ def main(kaiju, hamap, dbcan, tigrfams, diamond, output): sequence_counts = parse_diamond_outputs(diamond) # ECs may be a comma delimited list - # sequence_id -> [ec, gene, product, hmm_id, score, evalue] + # sequence_id -> [ec, gene, product, hmm_id, evalue, score] hamap_hits = parse_hmm_hits(hamap) - # sequence_id -> [ec, enzyme class, enzyme class subfamily, hmm_id, score, evalue] + # sequence_id -> [ec, enzyme class, enzyme class subfamily, hmm_id, evalue, score] dbcan_hits = parse_hmm_hits(dbcan) - # sequence_id -> [ec, gene, product, hmm_id, score, evalue] + # sequence_id -> [ec, gene, product, hmm_id, evalue, score] tigrfams_hits = parse_hmm_hits(tigrfams) output_header = [ @@ -108,7 +108,7 @@ def main(kaiju, hamap, dbcan, tigrfams, diamond, output): tigrfams_ec, tigrfams_gene, tigrfams_product, - tigerfams_score, + tigrfams_score, tigrfams_evalue, hamap_ec, hamap_gene, diff --git a/scripts/summarize_classifications.py b/scripts/summarize_classifications.py index 9e2afa0..eb0f89e 100755 --- a/scripts/summarize_classifications.py +++ b/scripts/summarize_classifications.py @@ -57,71 +57,82 @@ # logging.debug(f"Final table length: {len(df)}") # return df -def single_output(df,tax_level,samples,min_evlaue,min_score,min_len): +def single_output(df,tax_level,samples,min_evlaue,min_score,min_len,group_on,output): split_idx = TAX_LEVELS[tax_level] - group_one=["tigrfams","hamap","dbcan","kaiju"] - for item in group_one: - hmm_cols = df.filter(like=item).columns.values.tolist() - #TODO might not need - test_df = df[hmm_cols+samples].copy() - if "kaiju" in item: - test_df = test_df[(df["kaiju_length"] > min_len)] - test_df["kaiju_taxonomy"] = 
df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) - test_df = test_df.drop(columns=["kaiju_length"]).groupby(["kaiju_taxonomy"]).sum(axis=1).reset_index() - test_df = test_df.rename( - columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} - ) - else: - metric_cols = test_df.filter(items=[item+"_score",item+"_evalue"]).columns.values.tolist() - test_df = test_df[(test_df[item+"_score"] > min_score) & (test_df[item+"_evalue"] < min_evalue)] - test_df = test_df.drop(columns=metric_cols).groupby(hmm_cols[0:3]).sum(axis=1).reset_index().head() - test_df.to_csv(item+output, sep="\t", index=False) - - -def combined_output(df,tax_level,samples,min_evalue,min_score,min_len): - split_idx = TAX_LEVELS[tax_level] - group_one=["tigrfams","hamap","dbcan"] - kaiju_cols = df.filter(like="kaiju").columns.values.tolist() - for item in group_one: - hmm_cols = df.filter(like=item).columns.values.tolist() - #TODO might not need - test_df = df[kaiju_cols+hmm_cols+samples].copy() - test_df = test_df[(df["kaiju_length"] > min_len) & (test_df[item+"_score"] > min_score) & (test_df[item+"_evalue"] < min_evalue)] + # group_one=["tigrfams","hamap","dbcan","kaiju"] + # for item in group_one: + hmm_cols = df.filter(like=group_on[0]).columns.values.tolist() + #TODO might not need + test_df = df[hmm_cols+samples].copy() + if "kaiju" in item: + test_df = test_df[(df["kaiju_length"] > min_len)] test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) - metric_cols = test_df.filter(items=[item+"_score",item+"_evalue"]).columns.values.tolist() - metric_cols.append("kaiju_length") - test_df = test_df.drop(columns=metric_cols).groupby(["kaiju_taxonomy"]+hmm_cols[0:3]).sum(axis=1).reset_index() + test_df = test_df.drop(columns=["kaiju_length"]).groupby(["kaiju_taxonomy"]).sum(axis=1).reset_index() test_df = test_df.rename( columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} ) - test_df.to_csv("taxonomy_"+item, sep="\t", index=False) + else: + metric_cols = test_df.filter(items=[group_on[0]+"_score",group_on[0]+"_evalue"]).columns.values.tolist() + test_df = test_df[(test_df[group_on[0].lower()+"_score"] > min_score) & (test_df[group_on[0].lower()+"_evalue"] < min_evalue)] + test_df = test_df.drop(columns=metric_cols).groupby(hmm_cols[0:3]).sum(axis=1).reset_index().head() + test_df.to_csv(item+output, sep="\t", index=False) -def main(output, table, tax_level, min_evalue, min_score, min_len): +def combined_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,output): + split_idx = TAX_LEVELS[tax_level] + # group_one=["tigrfams","hamap","dbcan"] + kaiju_cols = df.filter(like="kaiju").columns.values.tolist() + # for item in group_one: + hmm_cols = df.filter(like=group_on[0].lower()).columns.values.tolist() + #TODO might not need + test_df = df[kaiju_cols+hmm_cols+samples].copy() + test_df = test_df[(df["kaiju_length"] > min_len) & (test_df[group_on[0]+"_score"] > min_score) & (test_df[group_on[0]+"_evalue"] < min_evalue)] + test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) + metric_cols = test_df.filter(items=[group_on[0].lower()+"_score",group_on[0].lower()+"_evalue"]).columns.values.tolist() + metric_cols.append("kaiju_length") + test_df = test_df.drop(columns=metric_cols).groupby(["kaiju_taxonomy"]+hmm_cols[0:3]).sum(axis=1).reset_index() + test_df = test_df.rename( + columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} + ) + test_df.to_csv(output, sep="\t", 
index=False) + + +def main(output, table, tax_level, min_evalue, min_score, min_len,group_on): df = pd.read_table(table) # remove the rows w/ no information - df = df.query("kaiju_length != 0") + df = df.dropna( + subset=[ + "kaiju_taxonomy", "tigrfams_ec", "tigrfams_gene", "tigrfams_product", + "tigrfams_score", "tigrfams_evalue", "hamap_ec", "hamap_gene", + "hamap_product", "hamap_score", "hamap_evalue", "dbcan_ec", + "dbcan_enzyme_class", "dbcan_enzyme_class_subfamily" + ], + thresh=1) samples = df.filter(regex="[0-9]+").columns.values.tolist() - split_idx = TAX_LEVELS[tax_level] - group_one=["tigrfams","hamap","dbcan","kaiju"] - for item in group_one: - print(item) - use_these = df.filter(like=item).columns.values.tolist() - print(use_these) - #TODO might not need - test_df = df[use_these+samples].copy() - if "kaiju" in item: - test_df = test_df[(df["kaiju_length"] > min_len)] - test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) - test_df = test_df.drop(columns=["kaiju_length"]).groupby(["kaiju_taxonomy"]).sum(axis=1).reset_index() - test_df = test_df.rename( - columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} - ) - else: - metric_cols = test_df.filter(items=[item+"_score",item+"_evalue"]).columns.values.tolist() - test_df = test_df[(test_df[item+"_score"] > min_score) & (test_df[item+"_evalue"] < min_evalue)] - test_df = test_df.drop(columns=metric_cols).groupby(use_these[0:3]).sum(axis=1).reset_index().head() - test_df.to_csv(item+output, sep="\t", index=False) + if len(group_on) > 1: + combined_output(df,tax_level,min_evalue, min_score, min_len, group_on,output) + else: + single_output(df,tax_level,samples,min_evlaue,min_score,min_len,group_on,output) + # split_idx = TAX_LEVELS[tax_level] + # group_one=["tigrfams","hamap","dbcan","kaiju"] + # for item in group_one: + # print(item) + # use_these = df.filter(like=item).columns.values.tolist() + # print(use_these) + # #TODO might not need + # test_df = df[use_these+samples].copy() + # if "kaiju" in item: + # test_df = test_df[(df["kaiju_length"] > min_len)] + # test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) + # test_df = test_df.drop(columns=["kaiju_length"]).groupby(["kaiju_taxonomy"]).sum(axis=1).reset_index() + # test_df = test_df.rename( + # columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} + # ) + # else: + # metric_cols = test_df.filter(items=[item+"_score",item+"_evalue"]).columns.values.tolist() + # test_df = test_df[(test_df[item+"_score"] > min_score) & (test_df[item+"_evalue"] < min_evalue)] + # test_df = test_df.drop(columns=metric_cols).groupby(use_these[0:3]).sum(axis=1).reset_index().head() + # test_df.to_csv(item+output, sep="\t", index=False) # tax_split_level = TAX_LEVELS[tax_level] # sample_df = None # logging.debug(f"Preparing to parse {len(tables)} tables") @@ -185,13 +196,13 @@ def main(output, table, tax_level, min_evalue, min_score, min_len): default=0.001, help="minimum kaiju alignment length to retain", ) - # parser.add_argument( - # "--group-on", - # # choices=["kaiju_classification", "ko", "ec", "product"], - # nargs="+", - # default="kaiju_taxonomy", - # help="cluster tables on taxonomy and/or function; headers must match in tables", - # ) + parser.add_argument( + "--group-on", + # choices=["kaiju_classification", "ko", "ec", "product"], + nargs="+", + default="kaiju_taxonomy", + help="cluster tables on taxonomy and/or function; headers must match in tables", + ) 
parser.add_argument( "--verbose", action="store_true", help="increase output verbosity" ) @@ -205,5 +216,5 @@ def main(output, table, tax_level, min_evalue, min_score, min_len): args.min_evalue, args.min_score, args.min_len, - # args.group_on, + args.group_on, ) From 69e6f0eb9208a3e883606afdef957d54db62fa02 Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 27 Nov 2018 16:22:59 -0800 Subject: [PATCH 03/14] fix error in combined output rule --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index cedabe9..a6a6f00 100644 --- a/Snakefile +++ b/Snakefile @@ -681,7 +681,7 @@ rule build_hmm_and_tax_table: python scripts/summarize_classifications.py \ --group-on {wildcards.hmm} kaiju_classification \ --tax-level {wildcards.tax_classification} --min-evalue {params.min_evalue} \ - --min-score {params.min_score}--min-len {params.min_len} {output} {input} + --min-score {params.min_score} --min-len {params.min_len} {output} {input} """ From 2e8b7821c10bcbd5dc8b648f77fcbf5251fb691f Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 27 Nov 2018 16:23:38 -0800 Subject: [PATCH 04/14] fixes report --- scripts/build_report.py | 82 +++++++++++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 20 deletions(-) diff --git a/scripts/build_report.py b/scripts/build_report.py index ace0eaa..e63bb73 100644 --- a/scripts/build_report.py +++ b/scripts/build_report.py @@ -1,4 +1,4 @@ -import argparse + import argparse import csv import os from collections import Counter, defaultdict @@ -45,16 +45,17 @@ def get_sample_order(lst): return [i[0] for i in sorted(lst, key=lambda x: x[1], reverse=True)] -def build_taxonomy_plot(txt, value_cols, height=900): +def build_taxonomy_plot(txt, height=900): df = pd.read_table(txt) + samples = df.filter(regex="[0-9]+").columns.values.tolist() levels = ["kingdom", "phylum", "class", "order"] df[levels] = df["taxonomy_order"].str.split(";", expand=True) hierarchy = ["phylum", "class", "order"] df[hierarchy] = df[hierarchy].fillna("NA") - df[value_cols] = df[value_cols].fillna(0) + df[samples] = df[samples].fillna(0) df = df[hierarchy + value_cols] dfs = relatively.get_dfs_across_hierarchy( - df, hierarchy, value_cols, reorder="shannon", dependent="; " + df, hierarchy, samples, reorder="shannon", dependent="; " ) fig = relatively.get_abundance_figure_from_dfs( dfs, hierarchy, "Assigned Taxonomy Per Sample", height=height @@ -66,11 +67,47 @@ def get_sample_name(path, key): return [os.path.basename(item).partition(key)[0] for item in path] +# def parse_classifications_for_taxonomy(path): +# """ +# SN1035:381:h3cv7bcx2:2:2202:3165:73750 -1 -1 +# SN1035:381:h3cv7bcx2:1:1105:7280:88523 67.5 40 111 Archaea; Thaumarchaeota; Nitrososphaerales; Nitrososphaeria; Nitrososphaeraceae; Candidatus Nitrosocosmicus; Candidatus Nitrocosmicus oleophilus; +# SN1035:381:h3cv7bcx2:1:1204:20140:83036 78.4 37 ko:K00820 glmS, GFPT; glucosamine---fructose-6-phosphate aminotransferase (isomerizing) 2.6.1.16 229 Bacteria; Actinobacteria; NA; NA; NA; NA; Actinobacteria bacterium 13_1_40CM_66_12; +# """ +# logger.info("Parsing {}".format(path)) +# # hardcoded tax levels :( +# taxonomy_level_counter = { +# "order": Counter(), +# "class": Counter(), +# "phylum": Counter(), +# } +# taxonomy_counter = Counter() +# summary_counter = Counter() +# with open(path) as fh: +# # skip the header +# next(fh) +# for i, line in enumerate(fh, start=1): +# toks = line.strip("\r\n").split("\t") +# if toks[3]: +# summary_counter.update(["Assigned Function"]) +# if 
toks[7]: +# summary_counter.update(["Assigned Both"]) +# if toks[7]: +# taxonomy_counter.update([toks[7]]) +# taxonomy = [j.strip() for j in toks[7].split(";")] +# taxonomy_level_counter["order"].update([taxonomy[3]]) +# taxonomy_level_counter["class"].update([taxonomy[2]]) +# taxonomy_level_counter["phylum"].update([taxonomy[1]]) +# summary_counter.update(["Assigned Taxonomy"]) +# return dict( +# taxonomy_level_counter=taxonomy_level_counter, +# summary_counter=summary_counter +# ) + def parse_classifications_for_taxonomy(path): """ - SN1035:381:h3cv7bcx2:2:2202:3165:73750 -1 -1 - SN1035:381:h3cv7bcx2:1:1105:7280:88523 67.5 40 111 Archaea; Thaumarchaeota; Nitrososphaerales; Nitrososphaeria; Nitrososphaeraceae; Candidatus Nitrosocosmicus; Candidatus Nitrocosmicus oleophilus; - SN1035:381:h3cv7bcx2:1:1204:20140:83036 78.4 37 ko:K00820 glmS, GFPT; glucosamine---fructose-6-phosphate aminotransferase (isomerizing) 2.6.1.16 229 Bacteria; Actinobacteria; NA; NA; NA; NA; Actinobacteria bacterium 13_1_40CM_66_12; + 37056 274 Bacteria; Bacteroidetes; Sphingobacteriia; Sphingobacteriales; NA; NA; Sphingobacteriales bacterium UTBCD1; 3.4.21.92 clpP ATP-dependent Clp endopeptidase proteolytic subun + 75331 0 0 0 0 0 0 0 0 0 + """ logger.info("Parsing {}".format(path)) # hardcoded tax levels :( @@ -86,32 +123,37 @@ def parse_classifications_for_taxonomy(path): next(fh) for i, line in enumerate(fh, start=1): toks = line.strip("\r\n").split("\t") - if toks[3]: - summary_counter.update(["Assigned Function"]) - if toks[7]: - summary_counter.update(["Assigned Both"]) - if toks[7]: + if toks[2]: taxonomy_counter.update([toks[7]]) taxonomy = [j.strip() for j in toks[7].split(";")] taxonomy_level_counter["order"].update([taxonomy[3]]) taxonomy_level_counter["class"].update([taxonomy[2]]) taxonomy_level_counter["phylum"].update([taxonomy[1]]) summary_counter.update(["Assigned Taxonomy"]) + if not all(s=='' for s in toks[4:12]): + summary_counter.update(["Assigned Both"]) + if not all(s=='' for s in toks[4:12]): + summary_counter.update(["Assigned Function"]) + return dict( taxonomy_level_counter=taxonomy_level_counter, summary_counter=summary_counter ) - def compile_summary_df(classification_tables, tax_levels=["phylum", "class", "order"]): """ Reads in multiple sample alignments from diamond in a given directory and merges them into - a single pandas.DataFrame. It returns a pandas dataframe for each of th + a single pandas.DataFrame. It returns a pandas dataframe for each of phylum that is ready to plug into the processing function. 
Also returns total counts which is necessary to calculate the percentage of total that is being represented """ dfs = {} classifications_per_sample = {} + cols = df.filter(regex="[A-Z]+").columns.values.tolist() + samples = df.filter(regex="[0-9]+").columns.values.tolist() + for sample in samples: + df[cols] + for classification_table in classification_tables: sample = get_sample(classification_table, "_classifications.txt") parsed_taxonomy = parse_classifications_for_taxonomy(classification_table) @@ -311,14 +353,14 @@ def main( clean_logs = glob(clean_logs) unique_logs = glob(unique_logs) merge_logs = glob(merge_logs) - summary_tables = glob(summary_tables) + summary_table = summary_tables r1_quality_files = glob(r1_quality_files) - classifications_per_sample = compile_summary_df(summary_tables) - value_cols = get_sample_name(summary_tables, "_classifications.txt") - fig = build_taxonomy_plot(taxonomy_table, value_cols) + #classifications_per_sample = compile_summary_df(summary_tables) + #value_cols = get_sample_name(summary_tables, "_classifications.txt") + fig = build_taxonomy_plot(taxonomy_table) plots = offline.plot(fig, **PLOTLY_PARAMS) html_tbl = parse_log_files( - merge_logs, unique_logs, clean_logs, classifications_per_sample + merge_logs, unique_logs, clean_logs, summary_table ) quality_plot = build_quality_plot(r1_quality_files) conda_env = get_conda_env_str(conda_env) @@ -333,7 +375,7 @@ def main( PerSeq_ - Per sequence functional and taxonomic assignments ============================================================= -.. _PerSeq: https://github.com/pnnl +.. _PerSeq: https://github.com/PNNL-CompBio/perseq .. contents:: From c85d26a3c0300264bc0ce9944ed4f096b4b17e8c Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 27 Nov 2018 16:24:09 -0800 Subject: [PATCH 05/14] fixes evalue arg type --- scripts/summarize_classifications.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/summarize_classifications.py b/scripts/summarize_classifications.py index eb0f89e..46baf37 100755 --- a/scripts/summarize_classifications.py +++ b/scripts/summarize_classifications.py @@ -192,7 +192,7 @@ def main(output, table, tax_level, min_evalue, min_score, min_len,group_on): ) parser.add_argument( "--min-len", - type=int, + type=float, default=0.001, help="minimum kaiju alignment length to retain", ) From 61af7ee141d3511ea6799693dbf5ef1f38ad9dd8 Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 27 Nov 2018 17:33:44 -0800 Subject: [PATCH 06/14] adds rule to remove empty annotations --- Snakefile | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/Snakefile b/Snakefile index a6a6f00..697ac5f 100644 --- a/Snakefile +++ b/Snakefile @@ -621,9 +621,22 @@ rule combine_sample_output: """ +rule remove_empty_annotations: + input: + annotations = "gene_catalog/annotations.txt" + output: + cleaned = "gene_catalog/annotations_cleaned.txt" + run: + with open(input.annotations) as file,open(output.cleaned,"w") as output: + next(file) + for line in file: + toks = line.strip("\r\n").split("\t") + if not all(s=='' for s in toks[4:12]): + print(line, end="\n", file=output) + rule build_hmms_table: input: - "gene_catalog/annotations.txt" + "gene_catalog/annotations_cleaned.txt" output: "summaries/hmms_summary/{hmm}_summary.txt" params: @@ -645,7 +658,7 @@ rule build_hmms_table: rule build_tax_table: input: - "gene_catalog/annotations.txt" + "gene_catalog/annotations_cleaned.txt" output: "summaries/taxonomy/{tax_classification}.txt" params: @@ -667,7 +680,7 @@ 
rule build_tax_table: rule build_hmm_and_tax_table: input: - "gene_catalog/annotations.txt" + "gene_catalog/annotations_cleaned.txt" output: "summaries/combined/{hmm}_{tax_classification}.txt" params: @@ -749,7 +762,7 @@ rule zip_attachments: rule build_report: input: - annotations = "gene_catalog/annotations.txt", + annotations = "gene_catalog/annotations_cleaned.txt", ee_stats = expand("logs/{sample}_{idx}_eestats.txt", sample=config["samples"].keys(), idx=["R1", "R2"]), clean_length_logs = expand("logs/{sample}_03_clean_readlengths.txt", sample=config["samples"].keys()), unique_length_logs = expand("logs/{sample}_02_unique_readlengths.txt", sample=config["samples"].keys()), From 3c296831faa229a083af0487a31a67d17305af06 Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Mon, 3 Dec 2018 15:42:29 -0800 Subject: [PATCH 07/14] fixed annotations parsing --- Snakefile | 27 ++++++++------- scripts/build_report.py | 50 +++++++++++++++++----------- scripts/summarize_classifications.py | 31 +++++++++-------- 3 files changed, 63 insertions(+), 45 deletions(-) diff --git a/Snakefile b/Snakefile index 697ac5f..e2835ca 100644 --- a/Snakefile +++ b/Snakefile @@ -136,12 +136,12 @@ def get_summaries(): hmm = ["TIGRFAMs","HAMAP","dbCAN"] taxonomy = ["phylum", "class", "order"] # code to generate the possible files - file_paths = expand("summaries/combined/{hmm}_{taxonomy}.txt", - hmm=hmm, taxonomy=taxonomy) + file_paths = expand("summaries/combined/{hmm}_{tax_classification}.txt", + hmm=hmm, tax_classification=taxonomy) file_paths.extend(expand("summaries/hmms_summary/{hmm}_summary.txt", hmm=hmm)) - file_paths.extend(expand("summaries/taxonomy/{taxonomy}.txt", - taxonomy=taxonomy)) + file_paths.extend(expand("summaries/taxonomy/{tax_classification}.txt", + tax_classification=taxonomy)) return file_paths @@ -627,12 +627,13 @@ rule remove_empty_annotations: output: cleaned = "gene_catalog/annotations_cleaned.txt" run: - with open(input.annotations) as file,open(output.cleaned,"w") as output: - next(file) + with open(input.annotations) as file, open(output.cleaned,"w") as output: + header = next(file) + print(header.strip("\n"), file=output) for line in file: toks = line.strip("\r\n").split("\t") if not all(s=='' for s in toks[4:12]): - print(line, end="\n", file=output) + print(line.strip("\n"), file=output) rule build_hmms_table: input: @@ -672,7 +673,7 @@ rule build_tax_table: shell: """ python scripts/summarize_classifications.py \ - --group-on kaiju_classification --min-evalue {params.min_evalue} \ + --group-on kaiju_taxonomy --min-evalue {params.min_evalue} \ --min-score {params.min_score} --min-len {params.min_len} \ --tax-level {wildcards.tax_classification} {output} {input} """ @@ -692,7 +693,7 @@ rule build_hmm_and_tax_table: shell: """ python scripts/summarize_classifications.py \ - --group-on {wildcards.hmm} kaiju_classification \ + --group-on {wildcards.hmm} kaiju_taxonomy \ --tax-level {wildcards.tax_classification} --min-evalue {params.min_evalue} \ --min-score {params.min_score} --min-len {params.min_len} {output} {input} """ @@ -752,10 +753,12 @@ rule zip_attachments: krona_tax = "krona_plots/tax.krona.html", krona_ec = "krona_plots/ec.krona.html" output: - temp("perseq_downloads.zip") + temp("perseq_downloads.tar.gz") + conda: + CONDAENV shell: """ - zip {output} {input.function} {input.taxonomy} {input.combined} \ + tar -czf {output} {input.function} {input.taxonomy} {input.combined} \ {input.krona_tax} {input.krona_ec} """ @@ -769,7 +772,7 @@ rule build_report: clean_logs = 
expand("logs/{sample}_decontamination.log", sample=config["samples"].keys()), merge_logs = expand("logs/{sample}_merge_sequences.log", sample=config["samples"].keys()), taxonomy = "summaries/taxonomy/order.txt", - zipped_files = "perseq_downloads.zip" + zipped_files = "perseq_downloads.tar.gz" output: "summary.html" conda: diff --git a/scripts/build_report.py b/scripts/build_report.py index e63bb73..496073d 100644 --- a/scripts/build_report.py +++ b/scripts/build_report.py @@ -1,4 +1,4 @@ - import argparse +import argparse import csv import os from collections import Counter, defaultdict @@ -53,7 +53,7 @@ def build_taxonomy_plot(txt, height=900): hierarchy = ["phylum", "class", "order"] df[hierarchy] = df[hierarchy].fillna("NA") df[samples] = df[samples].fillna(0) - df = df[hierarchy + value_cols] + df = df[hierarchy + samples] dfs = relatively.get_dfs_across_hierarchy( df, hierarchy, samples, reorder="shannon", dependent="; " ) @@ -122,19 +122,22 @@ def parse_classifications_for_taxonomy(path): # skip the header next(fh) for i, line in enumerate(fh, start=1): - toks = line.strip("\r\n").split("\t") - if toks[2]: - taxonomy_counter.update([toks[7]]) - taxonomy = [j.strip() for j in toks[7].split(";")] - taxonomy_level_counter["order"].update([taxonomy[3]]) - taxonomy_level_counter["class"].update([taxonomy[2]]) - taxonomy_level_counter["phylum"].update([taxonomy[1]]) - summary_counter.update(["Assigned Taxonomy"]) - if not all(s=='' for s in toks[4:12]): - summary_counter.update(["Assigned Both"]) + toks = line.strip("\n").split("\t") + try: + if toks[2]: + taxonomy_counter.update([toks[2]]) + taxonomy = [j.strip() for j in toks[2].split(";")] + taxonomy_level_counter["order"].update([taxonomy[3]]) + taxonomy_level_counter["class"].update([taxonomy[2]]) + taxonomy_level_counter["phylum"].update([taxonomy[1]]) + summary_counter.update(["Assigned Taxonomy"]) + if not all(s=='' for s in toks[4:12]): + summary_counter.update(["Assigned Both"]) + except: + print(len(line)) + continue if not all(s=='' for s in toks[4:12]): summary_counter.update(["Assigned Function"]) - return dict( taxonomy_level_counter=taxonomy_level_counter, summary_counter=summary_counter @@ -215,6 +218,7 @@ def parse_log_files(merge_logs, unique_logs, clean_logs, classifications_per_sam "Unique", "Clean", ] + samp_df = pd.DataFrame.from_dict(classifications_per_sample) count_table = defaultdict(list) # initial read count, join count, join rate, insert size from merge step for merge_log in merge_logs: @@ -238,9 +242,12 @@ def parse_log_files(merge_logs, unique_logs, clean_logs, classifications_per_sam count_table[sample].append(int(line.strip("\r\n").split("\t")[-1])) log_df = pd.DataFrame.from_dict(count_table, orient="index") + print(log_df.head(10)) + print(log_df.shape) + print(samp_df.shape) log_df.columns = header # print("classifications", classifications_per_sample.head()) - log_df = log_df.merge(classifications_per_sample, left_index=True, right_index=True) + log_df = log_df.merge(samp_df, left_index=True, right_index=True) # print(log_df.head()) log_df.reset_index(inplace=True) header.insert(0, "Sample") @@ -347,20 +354,25 @@ def main( r1_quality_files, html, conda_env, - function_table, + taxonomy_table, zipped_file ): clean_logs = glob(clean_logs) unique_logs = glob(unique_logs) merge_logs = glob(merge_logs) summary_table = summary_tables + # parse classifcations for summary + assigned_dict = parse_classifications_for_taxonomy(summary_table)["summary_counter"] r1_quality_files = glob(r1_quality_files) - 
#classifications_per_sample = compile_summary_df(summary_tables) + #classifications_per_sample = compile_summary_df(summary_table) #value_cols = get_sample_name(summary_tables, "_classifications.txt") fig = build_taxonomy_plot(taxonomy_table) plots = offline.plot(fig, **PLOTLY_PARAMS) + # html_tbl = parse_log_files( + # merge_logs, unique_logs, clean_logs, classifications_per_sample + # ) html_tbl = parse_log_files( - merge_logs, unique_logs, clean_logs, summary_table + merge_logs, unique_logs, clean_logs, assigned_dict ) quality_plot = build_quality_plot(r1_quality_files) conda_env = get_conda_env_str(conda_env) @@ -572,7 +584,7 @@ def main( p.add_argument("--r1-quality-files") p.add_argument("--html") p.add_argument("conda_env") - p.add_argument("function_table") + p.add_argument("taxonomy_table") p.add_argument("zipped_file") # p.add_argument("taxonomy_table") # p.add_argument("taxonomy_function_table") @@ -587,7 +599,7 @@ def main( args.r1_quality_files, args.html, args.conda_env, - args.function_table, + args.taxonomy_table, args.zipped_file, # args.taxonomy_table, # args.taxonomy_function_table, diff --git a/scripts/summarize_classifications.py b/scripts/summarize_classifications.py index 46baf37..4721468 100755 --- a/scripts/summarize_classifications.py +++ b/scripts/summarize_classifications.py @@ -57,25 +57,28 @@ # logging.debug(f"Final table length: {len(df)}") # return df -def single_output(df,tax_level,samples,min_evlaue,min_score,min_len,group_on,output): +def single_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,output): split_idx = TAX_LEVELS[tax_level] # group_one=["tigrfams","hamap","dbcan","kaiju"] - # for item in group_one: - hmm_cols = df.filter(like=group_on[0]).columns.values.tolist() - #TODO might not need - test_df = df[hmm_cols+samples].copy() - if "kaiju" in item: + # for item in group_on + #test_df.head() + if "kaiju_taxonomy" in group_on: + hmm_cols = df.filter(like="kaiju").columns.values.tolist() + test_df = df[hmm_cols+samples].copy() test_df = test_df[(df["kaiju_length"] > min_len)] - test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) + test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(str(x).split("; ", split_idx)[0:split_idx])) test_df = test_df.drop(columns=["kaiju_length"]).groupby(["kaiju_taxonomy"]).sum(axis=1).reset_index() test_df = test_df.rename( columns={"kaiju_taxonomy": f"taxonomy_{tax_level}"} ) else: + hmm_cols = df.filter(like=group_on[0].lower()).columns.values.tolist() + #TODO might not need + test_df = df[hmm_cols+samples].copy() metric_cols = test_df.filter(items=[group_on[0]+"_score",group_on[0]+"_evalue"]).columns.values.tolist() test_df = test_df[(test_df[group_on[0].lower()+"_score"] > min_score) & (test_df[group_on[0].lower()+"_evalue"] < min_evalue)] test_df = test_df.drop(columns=metric_cols).groupby(hmm_cols[0:3]).sum(axis=1).reset_index().head() - test_df.to_csv(item+output, sep="\t", index=False) + test_df.to_csv(output, sep="\t", index=False) def combined_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,output): @@ -86,8 +89,8 @@ def combined_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,o hmm_cols = df.filter(like=group_on[0].lower()).columns.values.tolist() #TODO might not need test_df = df[kaiju_cols+hmm_cols+samples].copy() - test_df = test_df[(df["kaiju_length"] > min_len) & (test_df[group_on[0]+"_score"] > min_score) & (test_df[group_on[0]+"_evalue"] < min_evalue)] - 
test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(x.split("; ", split_idx)[0:split_idx])) + test_df = test_df[(df["kaiju_length"] > min_len) & (test_df[group_on[0].lower()+"_score"] > min_score) & (test_df[group_on[0].lower()+"_evalue"] < min_evalue)] + test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(str(x).split("; ", split_idx)[0:split_idx])) metric_cols = test_df.filter(items=[group_on[0].lower()+"_score",group_on[0].lower()+"_evalue"]).columns.values.tolist() metric_cols.append("kaiju_length") test_df = test_df.drop(columns=metric_cols).groupby(["kaiju_taxonomy"]+hmm_cols[0:3]).sum(axis=1).reset_index() @@ -98,7 +101,7 @@ def combined_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,o def main(output, table, tax_level, min_evalue, min_score, min_len,group_on): - df = pd.read_table(table) + df = pd.read_table(table,low_memory=False) # remove the rows w/ no information df = df.dropna( subset=[ @@ -110,9 +113,9 @@ def main(output, table, tax_level, min_evalue, min_score, min_len,group_on): thresh=1) samples = df.filter(regex="[0-9]+").columns.values.tolist() if len(group_on) > 1: - combined_output(df,tax_level,min_evalue, min_score, min_len, group_on,output) + combined_output(df,tax_level,samples,min_evalue, min_score, min_len, group_on,output) else: - single_output(df,tax_level,samples,min_evlaue,min_score,min_len,group_on,output) + single_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,output) # split_idx = TAX_LEVELS[tax_level] # group_one=["tigrfams","hamap","dbcan","kaiju"] # for item in group_one: @@ -180,7 +183,7 @@ def main(output, table, tax_level, min_evalue, min_score, min_len,group_on): ) parser.add_argument( "--min-evalue", - type=int, + type=float, default=0.001, help="lowest evalue to retain per sequence per sample", ) From 848e36889b50ba5ef690fc013f424a56fb3501b3 Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 4 Dec 2018 10:53:13 -0800 Subject: [PATCH 08/14] fixed cleaning annotaitons --- Snakefile | 2 +- scripts/build_report.py | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/Snakefile b/Snakefile index e2835ca..ff8a699 100644 --- a/Snakefile +++ b/Snakefile @@ -632,7 +632,7 @@ rule remove_empty_annotations: print(header.strip("\n"), file=output) for line in file: toks = line.strip("\r\n").split("\t") - if not all(s=='' for s in toks[4:12]): + if not all(s=='' for s in toks[2:12]): print(line.strip("\n"), file=output) rule build_hmms_table: diff --git a/scripts/build_report.py b/scripts/build_report.py index 496073d..18fb71f 100644 --- a/scripts/build_report.py +++ b/scripts/build_report.py @@ -142,6 +142,39 @@ def parse_classifications_for_taxonomy(path): taxonomy_level_counter=taxonomy_level_counter, summary_counter=summary_counter ) +def parse_files_for_annotation(path): + """ + parses the annotations to retain which samples were assigned function vs + taxonomy vs both + """ + summary_counter={} + with open(path) as fh: + # skip the header + header = next(fh) + header = header.strip("\n").split("\t") + for sample in header[18:]: + summary_counter[sample]=Counter() + for line in fh: + toks = line.strip("\n").split("\t") + try: + if toks[2]: + for i,sample in enumerate(header[18:]): + if toks[18+i] != 0: + summary_counter[sample].update(["Assigned Taxonomy"]) + summary_counter.update() + if not all(s=='' for s in toks[3:12]): + for i,sample in enumerate(header[18:]): + if toks[18+i] != 0: + 
summary_counter[sample].update(["Assigned Both"]) + #summary_counter.update(["Assigned Both"]) + except: + print(len(line)) + continue + if not all(s=='' for s in toks[4:12]): + for i,sample in enumerate(header[18:]): + if toks[18+i] != 0: + summary_counter[sample].update(["Assigned Function"]) + return summary_counter def compile_summary_df(classification_tables, tax_levels=["phylum", "class", "order"]): """ @@ -218,7 +251,7 @@ def parse_log_files(merge_logs, unique_logs, clean_logs, classifications_per_sam "Unique", "Clean", ] - samp_df = pd.DataFrame.from_dict(classifications_per_sample) + samp_df = pd.DataFrame.from_dict(classifications_per_sample,orient = "index") count_table = defaultdict(list) # initial read count, join count, join rate, insert size from merge step for merge_log in merge_logs: @@ -362,7 +395,7 @@ def main( merge_logs = glob(merge_logs) summary_table = summary_tables # parse classifcations for summary - assigned_dict = parse_classifications_for_taxonomy(summary_table)["summary_counter"] + assigned_dict = parse_files_for_annotation(summary_table) r1_quality_files = glob(r1_quality_files) #classifications_per_sample = compile_summary_df(summary_table) #value_cols = get_sample_name(summary_tables, "_classifications.txt") From 55284a6186367561909e039e8f3e9ca9860f94ac Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 4 Dec 2018 13:50:10 -0800 Subject: [PATCH 09/14] fixed summary table metrics --- scripts/build_report.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/build_report.py b/scripts/build_report.py index 18fb71f..daadd40 100644 --- a/scripts/build_report.py +++ b/scripts/build_report.py @@ -159,12 +159,12 @@ def parse_files_for_annotation(path): try: if toks[2]: for i,sample in enumerate(header[18:]): - if toks[18+i] != 0: + if str(toks[18+i]) != 0: summary_counter[sample].update(["Assigned Taxonomy"]) summary_counter.update() if not all(s=='' for s in toks[3:12]): for i,sample in enumerate(header[18:]): - if toks[18+i] != 0: + if str(toks[18+i]) != 0: summary_counter[sample].update(["Assigned Both"]) #summary_counter.update(["Assigned Both"]) except: @@ -172,7 +172,7 @@ def parse_files_for_annotation(path): continue if not all(s=='' for s in toks[4:12]): for i,sample in enumerate(header[18:]): - if toks[18+i] != 0: + if str(toks[18+i]) != 0: summary_counter[sample].update(["Assigned Function"]) return summary_counter From f2e164003486b7d0aa4307b998f14681aafaadc4 Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Tue, 4 Dec 2018 15:41:25 -0800 Subject: [PATCH 10/14] updates reltaively version --- envs/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/envs/environment.yml b/envs/environment.yml index c8977a0..1ea37c7 100644 --- a/envs/environment.yml +++ b/envs/environment.yml @@ -19,5 +19,5 @@ dependencies: - seqtk=1.3 - pip - pip: - - relatively + - relatively>=1.0.5 - plotly>=3.0 From 83c64f8905a94bd72d98dd2fe440e2a0327f5e7d Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Wed, 5 Dec 2018 09:38:12 -0800 Subject: [PATCH 11/14] remove temp on zipped file --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index ff8a699..4fe7acc 100644 --- a/Snakefile +++ b/Snakefile @@ -753,7 +753,7 @@ rule zip_attachments: krona_tax = "krona_plots/tax.krona.html", krona_ec = "krona_plots/ec.krona.html" output: - temp("perseq_downloads.tar.gz") + "perseq_downloads.tar.gz" conda: CONDAENV shell: From ff90f7546f657683e8d94aef1a9113e99308eab9 Mon Sep 17 00:00:00 2001 From: 
nzavoshy Date: Wed, 5 Dec 2018 09:38:39 -0800 Subject: [PATCH 12/14] fixes column names --- scripts/build_report.py | 191 ++++++++++++++++++++++++---------------- 1 file changed, 114 insertions(+), 77 deletions(-) diff --git a/scripts/build_report.py b/scripts/build_report.py index daadd40..f5d4fb2 100644 --- a/scripts/build_report.py +++ b/scripts/build_report.py @@ -47,7 +47,7 @@ def get_sample_order(lst): def build_taxonomy_plot(txt, height=900): df = pd.read_table(txt) - samples = df.filter(regex="[0-9]+").columns.values.tolist() + samples = df.columns.values.tolist()[1:] levels = ["kingdom", "phylum", "class", "order"] df[levels] = df["taxonomy_order"].str.split(";", expand=True) hierarchy = ["phylum", "class", "order"] @@ -160,12 +160,12 @@ def parse_files_for_annotation(path): if toks[2]: for i,sample in enumerate(header[18:]): if str(toks[18+i]) != 0: - summary_counter[sample].update(["Assigned Taxonomy"]) - summary_counter.update() - if not all(s=='' for s in toks[3:12]): - for i,sample in enumerate(header[18:]): - if str(toks[18+i]) != 0: - summary_counter[sample].update(["Assigned Both"]) + summary_counter[sample].update(["Kaiju+NR"]) + #summary_counter.update() + # if not all(s=='' for s in toks[3:12]): + # for i,sample in enumerate(header[18:]): + # if str(toks[18+i]) != 0: + # summary_counter[sample].update(["Assigned Both"]) #summary_counter.update(["Assigned Both"]) except: print(len(line)) @@ -173,43 +173,43 @@ def parse_files_for_annotation(path): if not all(s=='' for s in toks[4:12]): for i,sample in enumerate(header[18:]): if str(toks[18+i]) != 0: - summary_counter[sample].update(["Assigned Function"]) + summary_counter[sample].update(["TIGRFAMs,HAMAP,dbCAN\n hits per sample"]) return summary_counter -def compile_summary_df(classification_tables, tax_levels=["phylum", "class", "order"]): - """ - Reads in multiple sample alignments from diamond in a given directory and merges them into - a single pandas.DataFrame. It returns a pandas dataframe for each of - phylum that is ready to plug into the processing function. Also returns total counts - which is necessary to calculate the percentage of total that is being represented - """ - dfs = {} - classifications_per_sample = {} - cols = df.filter(regex="[A-Z]+").columns.values.tolist() - samples = df.filter(regex="[0-9]+").columns.values.tolist() - for sample in samples: - df[cols] - - for classification_table in classification_tables: - sample = get_sample(classification_table, "_classifications.txt") - parsed_taxonomy = parse_classifications_for_taxonomy(classification_table) - # assigned #'s in summary table - classifications_per_sample[sample] = parsed_taxonomy["summary_counter"] - if len(dfs) == 0: - for tax_level in tax_levels: - dfs[tax_level] = get_df_at_tax_level( - parsed_taxonomy["taxonomy_level_counter"][tax_level], - sample, - tax_level, - ) - continue - - for tax_level in tax_levels: - df = get_df_at_tax_level( - parsed_taxonomy["taxonomy_level_counter"][tax_level], sample, tax_level - ) - dfs[tax_level] = dfs[tax_level].merge(df, on=tax_level, how="outer") - return pd.DataFrame.from_dict(classifications_per_sample, orient="index") +# def compile_summary_df(classification_tables, tax_levels=["phylum", "class", "order"]): +# """ +# Reads in multiple sample alignments from diamond in a given directory and merges them into +# a single pandas.DataFrame. It returns a pandas dataframe for each of +# phylum that is ready to plug into the processing function. 
Also returns total counts
+# which is necessary to calculate the percentage of total that is being represented
+#     """
+#     dfs = {}
+#     classifications_per_sample = {}
+#     cols = df.filter(regex="[A-Z]+").columns.values.tolist()
+#     samples = df.filter(regex="[0-9]+").columns.values.tolist()
+#     for sample in samples:
+#         df[cols]
+#
+#     for classification_table in classification_tables:
+#         sample = get_sample(classification_table, "_classifications.txt")
+#         parsed_taxonomy = parse_classifications_for_taxonomy(classification_table)
+#         # assigned #'s in summary table
+#         classifications_per_sample[sample] = parsed_taxonomy["summary_counter"]
+#         if len(dfs) == 0:
+#             for tax_level in tax_levels:
+#                 dfs[tax_level] = get_df_at_tax_level(
+#                     parsed_taxonomy["taxonomy_level_counter"][tax_level],
+#                     sample,
+#                     tax_level,
+#                 )
+#             continue
+#
+#         for tax_level in tax_levels:
+#             df = get_df_at_tax_level(
+#                 parsed_taxonomy["taxonomy_level_counter"][tax_level], sample, tax_level
+#             )
+#             dfs[tax_level] = dfs[tax_level].merge(df, on=tax_level, how="outer")
+#     return pd.DataFrame.from_dict(classifications_per_sample, orient="index")


 def get_df_at_tax_level(count_obj, sample, tax_level):
@@ -472,33 +472,54 @@ def main(
 specified during the decontamination process. Functional annotation and
 taxonomic classification were performed following the decontamination step.
 
+Translation and Clustering
+**************************
+
+Raw nucleotide sequences are translated into protein sequences using Prodigal[4]
+in anonymous mode. These sequences are then clustered at 90% sequence identity
+using MMseqs2[5] to decrease the computational resources required to perform
+annotation. These representative sequences are then annotated for functional
+and taxonomic information.
+
+
 Functional Annotation
 *********************
 
-The blastx algorithm of DIAMOND [4] was used to align nucleotide sequences to
-the KEGG protein reference database [5] consisting of non-redundant, family
-level fungal eukaryotes and genus level prokaryotes
-(``--strand=both --evalue 0.00001``). The highest scoring alignment per
-sequence was used for functional annotation.
+Three distinct HMM libraries are used to infer functional potential:
+HAMAP[6], dbCAN[7], and TIGRFAMs[8]. Each sequence was aligned to each database
+using hmmsearch from HMMER[9]. TIGRFAMs and HAMAP both provide an Enzyme
+Commission (EC) number, gene, and product, while dbCAN provides enzyme class,
+enzyme subclass, and EC.
 
 Taxonomic Annotation
 ********************
 
 Kmer-based taxonomic classification was performed on the merged reads using
-Kaiju [6] in greedy mode (``-a greedy -E 0.05``). NCBI's nr database [7]
+Kaiju [10] in greedy mode (``-a greedy -E 0.05``). NCBI's nr database [11]
 containing reference sequences for archaea, bacteria, viruses, fungi, and
 microbial eukaryotes was used as the reference index for Kaiju.
 
+Aggregation
+***********
+
+Once functional and taxonomic annotation has completed, all of the sequences per
+sample are aligned to the now annotated representative sequences using DIAMOND[12].
+
 
 References
 **********
 
 1. Rognes T, Flouri T, Nichols B, Quince C, Mahé F. VSEARCH: a versatile open source tool for metagenomics. PeerJ. PeerJ Inc; 2016;4:e2584.
 2. Bushnell B. BBTools [Internet]. Available from: https://sourceforge.net/projects/bbmap/
 3. Li H. Seqtk [Internet]. Available from: https://github.com/lh3/seqtk
-4. Buchfink B, Xie C, Huson DH. Fast and sensitive protein alignment using DIAMOND. Nat. Methods. Nature Publishing Group; 2015;12:59–60.
-5. Kanehisa M, Sato Y, Kawashima M, Furumichi M, Tanabe M. KEGG as a reference resource for gene and protein annotation. Nucleic Acids Res. 2016;44:D457–62.
-6. Menzel P, Ng KL, Krogh A. Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nat Commun. Nature Publishing Group; 2016;7:11257.
-7. NCBI Resource Coordinators. Database resources of the National Center for Biotechnology Information. Nucleic Acids Res. 2018;46:D8–D13.
+4. Hyatt D, Chen G-L, LoCascio PF, Land ML, Larimer FW, Hauser LJ. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics. 2010;11:119.
+5. Steinegger M, Söding J. MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nat Biotechnol. 2017;35:1026–8.
+6. Pedruzzi I, Rivoire C, Auchincloss AH, et al. HAMAP in 2015: updates to the protein family classification and annotation system. Nucleic Acids Res. 2015;43:D1064–70.
+7. Yin Y, Mao X, Yang J, Chen X, Mao F, Xu Y. dbCAN: a web resource for automated carbohydrate-active enzyme annotation. Nucleic Acids Res. 2012;40:W445–51.
+8. Haft DH, Selengut JD, White O. The TIGRFAMs database of protein families. Nucleic Acids Res. 2003;31:371–3.
+9. Eddy SR. Accelerated profile HMM searches. PLoS Comput Biol. 2011;7:e1002195.
+10. Menzel P, Ng KL, Krogh A. Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nat Commun. Nature Publishing Group; 2016;7:11257.
+11. NCBI Resource Coordinators. Database resources of the National Center for Biotechnology Information. Nucleic Acids Res. 2018;46:D8–D13.
+12. Buchfink B, Xie C, Huson DH. Fast and sensitive protein alignment using DIAMOND. Nat. Methods. Nature Publishing Group; 2015;12:59–60.
 
 
 Execution Environment
@@ -511,10 +532,10 @@ def main(
 Output
 ------
 
-Classification Tables
+Annotations Table
 *********************
 
-Per sample classifications in tables/ contain:
+Table containing annotations across all samples:
 
 .. table::
     :name: classificationtable
 
     ========================= ==========================================================================================================================================
     Header ID                 Definition
     ========================= ==========================================================================================================================================
+    seq                       identifier of the representative sequence in the gene catalog
+    kaiju_length              length of the Kaiju match; filtered with ``--min-len``
+    kaiju_taxonomy            semicolon delimited taxonomic lineage assigned by Kaiju
+    tigrfams_ec               Enzyme Commission number of the best TIGRFAMs hit; comma delimited where multiple
+    tigrfams_gene             gene symbol of the best TIGRFAMs hit
+    tigrfams_product          product description of the best TIGRFAMs hit
+    tigrfams_score            bit score of the best TIGRFAMs hit
+    tigrfams_evalue           e-value of the best TIGRFAMs hit
+    hamap_ec                  Enzyme Commission number of the best HAMAP hit
+    hamap_gene                gene symbol of the best HAMAP hit
+    hamap_product             product description of the best HAMAP hit
+    hamap_score               bit score of the best HAMAP hit
+    hamap_evalue              e-value of the best HAMAP hit
+
+    <HMM>_ec                  Enzyme Commission number from the given HMM library
+    <HMM>_gene                gene symbol from the given HMM library hit
     ec Enzyme Commission number from KEGG; semicolon delimited where multiple
     ko KEGG entry ID
     product KEGG gene ID KEGG product
@@ -555,41 +590,43 @@ def main(
 
 Function
 ````````
 
-Per function assignments in tables named **summaries/function/.txt**
+Per function assignments in tables named **summaries/hmms_summary/<HMM>_summary.txt**
 contain:
 
 .. table::
     :name: functiondeftable
 
-    ==================== ===================================================================
-    Header ID            Definition
-    ==================== ===================================================================
-                         either KO, EC, or product into which counts have been summed
-    samples names        non-normalized, per sample sum for this particular functional group
-    level_1              KEGG hierarchy [level 1] if KO defined in first column
-    level_2              KEGG hierarchy [level 2] if KO defined in first column
-    level_3              KEGG hierarchy [level 3] if KO defined in first column
-    ==================== ===================================================================
+    ============================ ====================================================================
+    Header ID                    Definition
+    ============================ ====================================================================
+    <HMM>_ec                     either HAMAP, dbCAN, or TIGRFAMs EC
+    <HMM>_gene                   either HAMAP, dbCAN, or TIGRFAMs gene
+    <HMM>_product                either HAMAP, dbCAN, or TIGRFAMs product
+    <HMM>_enzyme_class           dbCAN only enzyme class descriptor
+    <HMM>_enzyme_class_subfamily dbCAN only enzyme class subfamily descriptor
+    samples names                non-normalized, per sample sum for this particular functional group
+    ============================ ====================================================================
 
 Combined
 ````````
 
 Per taxonomy+function assignments in tables named
-**summaries/combined/_.txt** contain:
+**summaries/combined/<HMM>_<tax_classification>.txt** contain:
 
 .. table::
     :name: combineddeftable
 
-    ==================== ====================================================================
-    Header ID            Definition
-    ==================== ====================================================================
-                         either KO, EC, or product; counts are summed using  +
-    taxonomy_            taxonomic level; counts are summed using  +
-    sample names         non-normalized, per sample sum for this particular functional group
-    level_1              KEGG hierarchy [level 1] if KO defined in first column
-    level_2              KEGG hierarchy [level 2] if KO defined in first column
-    level_3              KEGG hierarchy [level 3] if KO defined in first column
-    ==================== ====================================================================
+    ============================ ====================================================================
+    Header ID                    Definition
+    ============================ ====================================================================
+    <HMM>_ec                     either HAMAP, dbCAN, or TIGRFAMs EC
+    <HMM>_gene                   either HAMAP, dbCAN, or TIGRFAMs gene
+    <HMM>_product                either HAMAP, dbCAN, or TIGRFAMs product
+    <HMM>_enzyme_class           dbCAN only enzyme class descriptor
+    <HMM>_enzyme_class_subfamily dbCAN only enzyme class subfamily descriptor
+    taxonomy_<tax_level>         taxonomic level; counts are summed using <HMM> + <tax_level>
+    sample names                 non-normalized, per sample sum for this particular functional group
+    ============================ ====================================================================
 
 Downloads
 ---------

From c82b642537be26404144ccabfa35e52fb39ce89c Mon Sep 17 00:00:00 2001
From: nzavoshy
Date: Wed, 5 Dec 2018 09:39:24 -0800
Subject: [PATCH 13/14] fixes combined function

---
 scripts/summarize_classifications.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/scripts/summarize_classifications.py b/scripts/summarize_classifications.py
index 4721468..8853d67 100755
--- a/scripts/summarize_classifications.py
+++ b/scripts/summarize_classifications.py
@@ -77,7 +77,7 @@ def single_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,out
         test_df = df[hmm_cols+samples].copy()
         metric_cols =
test_df.filter(items=[group_on[0]+"_score",group_on[0]+"_evalue"]).columns.values.tolist() test_df = test_df[(test_df[group_on[0].lower()+"_score"] > min_score) & (test_df[group_on[0].lower()+"_evalue"] < min_evalue)] - test_df = test_df.drop(columns=metric_cols).groupby(hmm_cols[0:3]).sum(axis=1).reset_index().head() + test_df = test_df.drop(columns=metric_cols).groupby(hmm_cols[0:3]).sum(axis=1).reset_index() test_df.to_csv(output, sep="\t", index=False) @@ -87,11 +87,14 @@ def combined_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,o kaiju_cols = df.filter(like="kaiju").columns.values.tolist() # for item in group_one: hmm_cols = df.filter(like=group_on[0].lower()).columns.values.tolist() + print(samples) + print(group_on[0]) #TODO might not need test_df = df[kaiju_cols+hmm_cols+samples].copy() test_df = test_df[(df["kaiju_length"] > min_len) & (test_df[group_on[0].lower()+"_score"] > min_score) & (test_df[group_on[0].lower()+"_evalue"] < min_evalue)] test_df["kaiju_taxonomy"] = df["kaiju_taxonomy"].apply(lambda x: "; ".join(str(x).split("; ", split_idx)[0:split_idx])) metric_cols = test_df.filter(items=[group_on[0].lower()+"_score",group_on[0].lower()+"_evalue"]).columns.values.tolist() + print(metric_cols) metric_cols.append("kaiju_length") test_df = test_df.drop(columns=metric_cols).groupby(["kaiju_taxonomy"]+hmm_cols[0:3]).sum(axis=1).reset_index() test_df = test_df.rename( @@ -102,16 +105,17 @@ def combined_output(df,tax_level,samples,min_evalue,min_score,min_len,group_on,o def main(output, table, tax_level, min_evalue, min_score, min_len,group_on): df = pd.read_table(table,low_memory=False) - # remove the rows w/ no information - df = df.dropna( - subset=[ - "kaiju_taxonomy", "tigrfams_ec", "tigrfams_gene", "tigrfams_product", + col_names = [ + "seq","kaiju_length","kaiju_taxonomy", "tigrfams_ec", "tigrfams_gene", "tigrfams_product", "tigrfams_score", "tigrfams_evalue", "hamap_ec", "hamap_gene", "hamap_product", "hamap_score", "hamap_evalue", "dbcan_ec", - "dbcan_enzyme_class", "dbcan_enzyme_class_subfamily" - ], - thresh=1) - samples = df.filter(regex="[0-9]+").columns.values.tolist() + "dbcan_enzyme_class", "dbcan_enzyme_class_subfamily","dbcan_score", "dbcan_evalue" + ] + # remove the rows w/ no information + # df = df.dropna( + # subset= col_names, + # thresh=2) + samples = [c for c in df.columns.values.tolist() if c not in col_names] if len(group_on) > 1: combined_output(df,tax_level,samples,min_evalue, min_score, min_len, group_on,output) else: From 17ff8e755926d3aa1b33422b4b736ac791e57e84 Mon Sep 17 00:00:00 2001 From: nzavoshy Date: Thu, 6 Dec 2018 14:07:48 -0800 Subject: [PATCH 14/14] fixes assigned columns --- scripts/build_report.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/build_report.py b/scripts/build_report.py index f5d4fb2..bb45b17 100644 --- a/scripts/build_report.py +++ b/scripts/build_report.py @@ -159,7 +159,7 @@ def parse_files_for_annotation(path): try: if toks[2]: for i,sample in enumerate(header[18:]): - if str(toks[18+i]) != 0: + if int(toks[18+i]) != 0: summary_counter[sample].update(["Kaiju+NR"]) #summary_counter.update() # if not all(s=='' for s in toks[3:12]): @@ -172,7 +172,7 @@ def parse_files_for_annotation(path): continue if not all(s=='' for s in toks[4:12]): for i,sample in enumerate(header[18:]): - if str(toks[18+i]) != 0: + if int(toks[18+i]) != 0: summary_counter[sample].update(["TIGRFAMs,HAMAP,dbCAN\n hits per sample"]) return summary_counter @@ -279,12 +279,14 @@ def 
parse_log_files(merge_logs, unique_logs, clean_logs, classifications_per_sam
     print(log_df.head(10))
     print(log_df.shape)
     print(samp_df.shape)
     log_df.columns = header
+    print(log_df.head())
     # print("classifications", classifications_per_sample.head())
     log_df = log_df.merge(samp_df, left_index=True, right_index=True)
+    print(log_df.head())
     # print(log_df.head())
     log_df.reset_index(inplace=True)
     header.insert(0, "Sample")
-    header.extend(["Assigned\nFunction", "Assigned\nTaxonomy", "Assigned\nBoth"])
+    header.extend(["TIGRFAMs,HAMAP,dbCAN\n hits per sample", "Kaiju+NR"])
     log_df.columns = header
     sample_summary_table = log_df[header].to_html(
         index=False,
@@ -533,7 +535,7 @@ def main(
 ------
 
 Annotations Table
-*********************
+*****************
 
 Table containing annotations across all samples:
 
 .. table::
     :name: classificationtable
 
     ========================= ==========================================================================================================================================
     Header ID                 Definition
     ========================= ==========================================================================================================================================
     seq                       identifier of the representative sequence in the gene catalog
     kaiju_length              length of the Kaiju match; filtered with ``--min-len``
     kaiju_taxonomy            semicolon delimited taxonomic lineage assigned by Kaiju
     tigrfams_ec               Enzyme Commission number of the best TIGRFAMs hit; comma delimited where multiple
     tigrfams_gene             gene symbol of the best TIGRFAMs hit
     tigrfams_product          product description of the best TIGRFAMs hit
     tigrfams_score            bit score of the best TIGRFAMs hit
     tigrfams_evalue           e-value of the best TIGRFAMs hit
     hamap_ec                  Enzyme Commission number of the best HAMAP hit
     hamap_gene                gene symbol of the best HAMAP hit
     hamap_product             product description of the best HAMAP hit
     hamap_score               bit score of the best HAMAP hit
     hamap_evalue              e-value of the best HAMAP hit
-
     <HMM>_ec                  Enzyme Commission number from the given HMM library
     <HMM>_gene                gene symbol from the given HMM library hit
     ec Enzyme Commission number from KEGG; semicolon delimited where multiple
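
A note on the Krona inputs built in patches 01 and 07: the per-sample files
written by build_ec_output in scripts/build_krona.py are header-less,
tab-separated files of the form count<TAB>level_1<TAB>...<TAB>level_4. The
sketch below reproduces that merge-and-split step on a toy table; the EC
numbers, hierarchy labels, and sample names are hypothetical, and in the real
pipeline the hierarchy comes from build_ec_dict.

import pandas as pd

# Hypothetical merged counts: one row per EC number, one count column per
# sample (the shape returned by parse_ec_file in scripts/build_krona.py).
ec_df = pd.DataFrame(
    {
        "tigrfams_ec": ["1.1.1.1", "2.6.1.16", "3.4.21.92"],
        "sample_01": [10, 4, 0],
        "sample_02": [2, 7, 5],
    }
)
# Hypothetical EC hierarchy lookup: ec -> [level_1 .. level_4].
hierarchy = {
    "1.1.1.1": ["Oxidoreductases", "Acting on CH-OH", "With NAD(+)", "alcohol dehydrogenase"],
    "2.6.1.16": ["Transferases", "Nitrogenous groups", "Transaminases", "GlmS"],
    "3.4.21.92": ["Hydrolases", "Peptide bonds", "Serine endopeptidases", "ClpP"],
}
hier_df = (
    pd.DataFrame.from_dict(hierarchy, orient="index")
    .reset_index()
    .rename(columns={"index": "tigrfams_ec", 0: "level_1", 1: "level_2", 2: "level_3", 3: "level_4"})
)
# Attach the hierarchy, then write one Krona text file per sample:
# count<TAB>level_1<TAB>level_2<TAB>level_3<TAB>level_4, no header.
merged = ec_df.merge(hier_df, on="tigrfams_ec", how="inner")
for sample in ["sample_01", "sample_02"]:
    merged[[sample, "level_1", "level_2", "level_3", "level_4"]].to_csv(
        f"{sample}_ec.txt", sep="\t", index=False, header=False
    )

Each such file can then be rendered with KronaTools, presumably via
ktImportText sample_01_ec.txt,sample_01 ... -o ec.krona.html, which is the
standard file,name pair syntax that ktImportText accepts.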
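
The summary rules touched in patches 02, 03, and 07 all rely on the same
pandas pattern: filter the annotation table on the per-library score and
e-value cutoffs, then group on the annotation columns and sum the per-sample
counts. A minimal, self-contained sketch of that pattern, using hypothetical
TIGRFAMs rows and sample names:

import pandas as pd

# Hypothetical slice of gene_catalog/annotations_cleaned.txt.
df = pd.DataFrame(
    {
        "tigrfams_ec": ["6.3.5.3", "6.3.5.3", "2.7.7.6"],
        "tigrfams_gene": ["purL", "purL", "rpoB"],
        "tigrfams_product": ["FGAM synthase", "FGAM synthase", "RNA polymerase beta subunit"],
        "tigrfams_score": [310.2, 55.0, 880.1],
        "tigrfams_evalue": [1e-90, 0.5, 1e-200],
        "sample_01": [3, 1, 8],
        "sample_02": [0, 2, 5],
    }
)
samples = ["sample_01", "sample_02"]
min_score, min_evalue = 60, 0.001

# Keep confident hits only; the weak purL row (score 55, e-value 0.5) drops out.
kept = df[(df["tigrfams_score"] > min_score) & (df["tigrfams_evalue"] < min_evalue)]
# Collapse duplicate annotations into per-sample sums.
summary = (
    kept.groupby(["tigrfams_ec", "tigrfams_gene", "tigrfams_product"])[samples]
    .sum()
    .reset_index()
)
summary.to_csv("TIGRFAMs_summary.txt", sep="\t", index=False)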
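
Finally, the taxonomy summaries cut each Kaiju lineage down to a fixed rank
before grouping, via str.split("; ", n) with a split limit, so that lineages
deeper than the requested rank collapse into the same key. A small example of
that one-liner, with made-up lineages:

import pandas as pd

TAX_LEVELS = {"phylum": 2, "class": 3, "order": 4}

lineages = pd.Series(
    [
        "Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; Enterobacteriaceae",
        "Archaea; Thaumarchaeota; Nitrososphaeria; Nitrososphaerales; Nitrososphaeraceae",
    ]
)
split_idx = TAX_LEVELS["order"]
# Split at most split_idx times and rejoin the leading ranks, so every
# lineage is truncated to kingdom..order regardless of its original depth.
truncated = lineages.apply(lambda x: "; ".join(str(x).split("; ", split_idx)[0:split_idx]))
print(truncated.tolist())
# ['Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales',
#  'Archaea; Thaumarchaeota; Nitrososphaeria; Nitrososphaerales']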