biodatageeks · KarolZebala · May 24, 2025 · Jun 7, 2025 · Jun 12, 2025 · Jun 12, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -15,6 +15,7 @@ datafusion-python = { git = "https://github.com/apache/datafusion-python.git", r
 pyo3 = { version = "0.22"}
 pyo3-log = "0.11.0"
 sequila-core = { git = "https://github.com/biodatageeks/sequila-native.git", rev = "43453dca320cc25a02f440850e3b947b813785c3" }
+rayon = "1.8"
 
 datafusion = { version = "43.0.0"}
 arrow = "53.3.0"

diff --git a/Zapis_genomow_decoded.md b/Zapis_genomow_decoded.md
@@ -0,0 +1,21 @@
+# Jak to działa?
+
+Na przykładzie tej sekwencji (poniżej cała jedna sekwencja):
+````text
+
+@SRR9130495.1 D00236:723:HG32CBCX2:1:1108:1330:1935/1
+NCAATACAAAAGCAATATGGGAGAAGCTACCTACCATGCTTAAAAACGCCAATGAGCAGNGATTTGTCANCNNNNNNNNCNNNNNNNNTNNTANNANNCTC
++
+#4BDFDFFHGHGGJJJHIIIIGGIIJGJJGIIIIBHIJJJIIJIJJIJDHIGGGIJJJI#-@AEHGEFF#,########,########+##++##+##+2<
+
+````
+
+## Znaczenie kolejnych linii rekordu
+
+1. `@identyfikator sekwencji` – znak `@` (małpa) oznacza początek rekordu; dalej następuje ciąg znaków identyfikatora, a końcówka `.1` (tu: `SRR9130495.1`) to liczba porządkowa odczytu.
+2. Sekwencja DNA, czyli kolejne nukleotydy odczytu, gdzie `N` oznacza niepewny odczyt.
+3. `+` – separator, po którym następuje linia z jakością odczytu.
+4. Jakość odczytu (czyli Phred Score) dla każdego nukleotydu, zapisana w postaci znaków ASCII.
+
+W tabeli wynikowej jeden wiersz odpowiada jednej sekwencji.
+
diff --git a/benchmark/src/base_seq_quality.py b/benchmark/src/base_seq_quality.py
@@ -0,0 +1,20 @@
+"""Smoke-test script: preview a FASTQ file and its base sequence quality table."""
+import polars_bio as pb
+import pandas
+
+# Input FASTQ, given as a relative path.
+path = "example.fastq"
+
+# Preview: collect the result of the FASTQ reader and show its first rows.
+print('Odczytan zawartość pliku', path)
+preview = pb.read_fastq(path).collect()
+print(preview.head())
+
+# Base sequence quality, requested as a pandas DataFrame with 4 target partitions.
+print('Base sequence quality dla pliku', path)
+quality = pb.cacl_base_seq_quality(
+    path,
+    target_partitions=4,
+    output_type='pandas.DataFrame',
+)
+print(quality.head())
diff --git a/benchmark/src/base_seq_quality_report.py b/benchmark/src/base_seq_quality_report.py
@@ -0,0 +1,165 @@
+import sys
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+import polars_bio as pb
+
+
+def main():
+    """Generate a FastQC-like HTML report of per-position Phred scores.
+
+    Reads ``example.fastq`` from the directory containing this script,
+    obtains the per-position summary table from ``pb.cacl_base_seq_quality``
+    and writes ``report_full.html`` (plot plus statistics table) next to it.
+
+    The table is expected to expose ``position``, ``min``, ``q1``, ``median``,
+    ``q3`` and ``max`` columns, since those are the ones read below.
+
+    Raises:
+        SystemExit: with status 1 when the FASTQ file does not exist.
+    """
+    script_dir = Path(__file__).parent
+    fastq = script_dir / "example.fastq"
+    out_html = script_dir / "report_full.html"
+
+    if not fastq.exists():
+        print(f"Nie znaleziono pliku: {fastq}", file=sys.stderr)
+        sys.exit(1)
+    print(f"Parsuję FASTQ: {fastq}")
+
+    df = pb.cacl_base_seq_quality(str(fastq), output_type='pandas.DataFrame')
+    print(df.columns)
+    print(df[['min', 'q1', 'median', 'q3', 'max']].apply(len))
+    print(f"Policzono statystyki: {df.shape[0]} pozycji")
+
+    # Whiskers (min..max at each position) drawn as ONE trace rather than one
+    # trace per row: every segment is followed by a None point, which breaks
+    # the line so neighbouring whiskers are not joined together.
+    whisker_x = []
+    whisker_y = []
+    for low, high, pos in zip(
+        df["min"].tolist(), df["max"].tolist(), df["position"].tolist()
+    ):
+        whisker_x += [low, high, None]
+        whisker_y += [pos, pos, None]
+
+    traces = [go.Scatter(
+        x=whisker_x,
+        y=whisker_y,
+        mode="lines",
+        line=dict(color="gray", width=2),
+        connectgaps=False,
+        showlegend=False,
+        hoverinfo="skip"
+    )]
+
+    # Raw five-number summary per row, exposed to the bar hover template.
+    customdata = df[['min', 'q1', 'median', 'q3', 'max']].to_numpy()
+
+    # Interquartile box: a horizontal bar starting at q1 with length q3 - q1.
+    traces.append(go.Bar(
+        x=df["q3"] - df["q1"],
+        y=df["position"],
+        base=df["q1"],
+        orientation="h",
+        marker_color="rgba(0,100,80,0.6)",
+        marker_line=dict(color="rgba(0,100,80,1)", width=2),
+        width=0.6,
+        showlegend=False,
+        customdata=customdata,
+        hovertemplate=(
+            "Position: %{y}<br>"
+            "Min: %{customdata[0]}<br>"
+            "Q1: %{customdata[1]}<br>"
+            "Median: %{customdata[2]}<br>"
+            "Q3: %{customdata[3]}<br>"
+            "Max: %{customdata[4]}<br>"
+        )
+    ))
+
+    # Median marker for each position (white "line-ns-open" symbol).
+    traces.append(go.Scatter(
+        x=df["median"],
+        y=df["position"],
+        mode="markers",
+        marker=dict(
+            color="white",
+            symbol="line-ns-open",
+            size=16,
+            line=dict(width=2)
+        ),
+        showlegend=False,
+        hoverinfo="skip"
+    ))
+
+    # Red line joining the medians; the only trace not hidden from the legend.
+    traces.append(go.Scatter(
+        x=df["median"],
+        y=df["position"],
+        mode="lines+markers",
+        line=dict(color="red", width=2),
+        marker=dict(size=4),
+        name="Median",
+        hoverinfo="skip"
+    ))
+
+    # 60 px per position plus 200 px of padding, never shorter than 600 px.
+    height = max(600, df.shape[0] * 60 + 200)
+
+    fig = go.Figure(traces)
+    # NOTE(review): autorange="reversed" is set together with a fixed range;
+    # presumably the autorange wins and range=[0, 100] is ignored -- confirm.
+    fig.update_yaxes(
+        autorange="reversed",
+        title="Position in read (bp)",
+        dtick=1,
+        range=[0, 100]
+    )
+    fig.update_xaxes(
+        title="Phred score",
+        dtick=2
+    )
+    fig.update_layout(
+        title="Phred Score per Base Position",
+        template="plotly_white",
+        height=height,
+        margin=dict(l=80, r=20, t=60, b=60)
+    )
+
+    # Embed only the figure fragment; plotly.js is referenced from the CDN.
+    plot_html = fig.to_html(full_html=False, include_plotlyjs='cdn')
+
+    table_html = df.to_html(classes="table table-striped", index=False)
+
+    html = f"""
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <title>FastQC-like report</title>
+  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.6.2/dist/css/bootstrap.min.css">
+  <style>
+    body {{ margin: 20px; }}
+    h1 {{ margin-bottom: 30px; }}
+    #plot {{ margin-bottom: 50px; }}
+  </style>
+</head>
+<body>
+  <h1>Phred Score per Base Position</h1>
+  <div id="plot">
+    {plot_html}
+  </div>
+  <h2>Statistics Table</h2>
+  <div id="table">
+    {table_html}
+  </div>
+</body>
+</html>
+"""
+    out_html.write_text(html, encoding='utf-8')
+    print(f" Wygenerowano pełny raport: {out_html}")
+
+if __name__ == "__main__":
+    main()