Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
1c39343
Add baseline implementation of base quality score
KarolZebala May 24, 2025
e951834
add udf functions.
KarolZebala Jun 7, 2025
e087df5
test dla quality_op; cacl_base_seq_quality
Jun 12, 2025
67228e0
exeptions tests addup
Jun 12, 2025
b09e653
Jupyter v1
Jun 13, 2025
5b77234
Jupyterv2
Jun 13, 2025
a8dcf04
add udf for stats
KarolZebala Jun 13, 2025
54dfd60
V2 testy oraz Jupiter
Jun 14, 2025
b5aa333
fixups
Jun 14, 2025
b053d9d
fixups v2
Jun 14, 2025
a61a461
Jupiter fixup
Jun 14, 2025
3fb48f0
liczba wierszy w tescie zwiekszona do 6
Jun 14, 2025
a3f2487
test paths fixup
Jun 14, 2025
5d01d85
V1 tests passed
Jun 14, 2025
0d01545
test_quality_op: EXPECTED_ROWS adjusted
piotrwalczak8 Jun 14, 2025
c5d7b9a
Merge branch 'master' of https://github.com/KarolZebala/polars-bio-z7
piotrwalczak8 Jun 14, 2025
1dfc91f
initial html generator added
piotrwalczak8 Jun 14, 2025
7ff5140
objasnienie zakodowania
Jun 14, 2025
ea9bd25
instrukcja fixup
Jun 14, 2025
2bae831
report generator adjustments 1
piotrwalczak8 Jun 14, 2025
5da1a78
added generated html report
piotrwalczak8 Jun 14, 2025
4495fb0
tests v2
Jun 14, 2025
0867f1a
Merge branch 'master' of https://github.com/KarolZebala/polars-bio-z7
Jun 14, 2025
4fa9ab8
duration test fixup
Jun 14, 2025
c9e1145
Functions comparisson test
piotrwalczak8 Jun 14, 2025
151301b
Final jupyter V1 push
Jun 15, 2025
ae405b5
fixes to notebook
KarolZebala Jun 15, 2025
a5f7c56
adjusted path for test 3
piotrwalczak8 Jun 15, 2025
8c835f2
remove old file
KarolZebala Jun 15, 2025
b1b5f24
adjusted report generator and added report
piotrwalczak8 Jun 15, 2025
fe32012
Merge branch 'master' of https://github.com/KarolZebala/polars-bio-z7
piotrwalczak8 Jun 15, 2025
20513ed
fix html generating
KarolZebala Jun 15, 2025
ec8ca77
final fix to notbook
KarolZebala Jun 15, 2025
ed41178
fix typo
KarolZebala Jun 15, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ datafusion-python = { git = "https://github.com/apache/datafusion-python.git", r
pyo3 = { version = "0.22"}
pyo3-log = "0.11.0"
sequila-core = { git = "https://github.com/biodatageeks/sequila-native.git", rev = "43453dca320cc25a02f440850e3b947b813785c3" }
rayon = "1.8"

datafusion = { version = "43.0.0"}
arrow = "53.3.0"
Expand Down
21 changes: 21 additions & 0 deletions Zapis_genomow_decoded.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Jak to działa?

Na przykładzie tej sekwencji (poniżej cała jedna sekwencja):
````text

@SRR9130495.1 D00236:723:HG32CBCX2:1:1108:1330:1935/1
NCAATACAAAAGCAATATGGGAGAAGCTACCTACCATGCTTAAAAACGCCAATGAGCAGNGATTTGTCANCNNNNNNNNCNNNNNNNNTNNTANNANNCTC
+
#4BDFDFFHGHGGJJJHIIIIGGIIJGJJGIIIIBHIJJJIIJIJJIJDHIGGGIJJJI#-@AEHGEFF#,########,########+##++##+##+2<

````

## Znaczenie kolejnych linii rekordu

1. `@identyfikator sekwencji` – znak `@` (małpa) oznacza początek rekordu; po nim następuje identyfikator odczytu, w którym końcówka `.1` to liczba porządkowa odczytu.
2. Sekwencja DNA, czyli kolejne nukleotydy (A, C, G, T); `N` oznacza niepewny odczyt danej zasady.
3. `+` – separator, po którym następuje linia z jakością odczytu.
4. Jakość odczytu (Phred score) dla każdego nukleotydu sekwencji, zakodowana w postaci znaków ASCII.

W tabeli wynikowej jeden wiersz odpowiada jednej sekwencji

12 changes: 12 additions & 0 deletions benchmark/src/base_seq_quality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import polars_bio as pb
import pandas

# FASTQ file used by this demo script (relative path).
path = "example.fastq"

# Step 1: read the FASTQ file through polars_bio and print the head of the
# collected result as a quick preview of the raw records.
print('Odczytan zawartość pliku', path)
fastq = pb.read_fastq(path).collect().head()
print(fastq)

# Step 2: compute the base sequence quality for the same file, asking for
# 4 target partitions and a pandas.DataFrame result, then print its head.
print('Base sequence quality dla pliku', path)
test = pb.cacl_base_seq_quality(path, target_partitions=4, output_type='pandas.DataFrame')
print(test.head())
136 changes: 136 additions & 0 deletions benchmark/src/base_seq_quality_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import polars_bio as pb


def main():
    """Build a standalone HTML quality report for ``example.fastq``.

    The FASTQ file is looked up next to this script. Its statistics are
    obtained from ``pb.cacl_base_seq_quality`` as a pandas DataFrame with
    ``position``, ``min``, ``q1``, ``median``, ``q3`` and ``max`` columns,
    drawn as a horizontal box-plot-like Plotly figure and written, together
    with the statistics table, to ``report_full.html`` in the same directory.

    Exits the process with status 1 when the FASTQ file does not exist.
    """
    base_dir = Path(__file__).parent
    fastq_path = base_dir / "example.fastq"
    report_path = base_dir / "report_full.html"

    # Guard clause: nothing to report on without the input file.
    if not fastq_path.exists():
        print(f"Nie znaleziono pliku: {fastq_path}", file=sys.stderr)
        sys.exit(1)
    print(f"Parsuję FASTQ: {fastq_path}")

    stats = pb.cacl_base_seq_quality(str(fastq_path), output_type='pandas.DataFrame')
    summary_cols = ['min', 'q1', 'median', 'q3', 'max']
    print(stats.columns)
    print(stats[summary_cols].apply(len))
    print(f"Policzono statystyki: {stats.shape[0]} pozycji")

    # Whiskers: one gray horizontal segment per row, from min to max.
    whiskers = [
        go.Scatter(
            x=[row["min"], row["max"]],
            y=[row["position"], row["position"]],
            mode="lines",
            line={"color": "gray", "width": 2},
            showlegend=False,
            hoverinfo="skip",
        )
        for _, row in stats.iterrows()
    ]

    # Boxes: horizontal bars starting at q1 with length q3 - q1; the hover
    # text reads the five summary values from customdata.
    hover_text = (
        "Position: %{y}<br>"
        "Min: %{customdata[0]}<br>"
        "Q1: %{customdata[1]}<br>"
        "Median: %{customdata[2]}<br>"
        "Q3: %{customdata[3]}<br>"
        "Max: %{customdata[4]}<br>"
    )
    boxes = go.Bar(
        x=stats["q3"] - stats["q1"],
        y=stats["position"],
        base=stats["q1"],
        orientation="h",
        marker_color="rgba(0,100,80,0.6)",
        marker_line={"color": "rgba(0,100,80,1)", "width": 2},
        width=0.6,
        showlegend=False,
        customdata=stats[summary_cols].to_numpy(),
        hovertemplate=hover_text,
    )

    # Median tick drawn on top of every box.
    median_ticks = go.Scatter(
        x=stats["median"],
        y=stats["position"],
        mode="markers",
        marker={
            "color": "white",
            "symbol": "line-ns-open",
            "size": 16,
            "line": {"width": 2},
        },
        showlegend=False,
        hoverinfo="skip",
    )

    # Red line joining the medians; the only trace with a legend name.
    median_line = go.Scatter(
        x=stats["median"],
        y=stats["position"],
        mode="lines+markers",
        line={"color": "red", "width": 2},
        marker={"size": 4},
        name="Median",
        hoverinfo="skip",
    )

    # Figure height grows with the number of rows, never below 600.
    figure_height = max(600, stats.shape[0] * 60 + 200)

    # Trace order matters for layering: whiskers, boxes, ticks, median line.
    fig = go.Figure([*whiskers, boxes, median_ticks, median_line])
    fig.update_yaxes(
        autorange="reversed",
        title="Position in read (bp)",
        dtick=1,
        range=[0, 100],
    )
    fig.update_xaxes(title="Phred score", dtick=2)
    fig.update_layout(
        title="Phred Score per Base Position",
        template="plotly_white",
        height=figure_height,
        margin={"l": 80, "r": 20, "t": 60, "b": 60},
    )

    # Plot as an embeddable fragment (plotly.js loaded from the CDN) and the
    # statistics as a Bootstrap-styled table.
    plot_fragment = fig.to_html(full_html=False, include_plotlyjs='cdn')
    table_fragment = stats.to_html(classes="table table-striped", index=False)

    # Doubled braces in the <style> section are literal CSS braces escaped
    # for the f-string.
    page = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>FastQC-like report</title>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.6.2/dist/css/bootstrap.min.css">
<style>
body {{ margin: 20px; }}
h1 {{ margin-bottom: 30px; }}
#plot {{ margin-bottom: 50px; }}
</style>
</head>
<body>
<h1>Phred Score per Base Position</h1>
<div id="plot">
{plot_fragment}
</div>
<h2>Statistics Table</h2>
<div id="table">
{table_fragment}
</div>
</body>
</html>
"""
    report_path.write_text(page, encoding='utf-8')
    print(f" Wygenerowano pełny raport: {report_path}")

# Generate the report only when this file is executed as a script, so that
# importing the module does not trigger the report generation.
if __name__ == "__main__":
    main()
Loading