Skip to content

Commit 68dd32c

Browse files
authored
fix: Views operations (#109)
* fix: Case sensitive info fields
* fix: Views operations
1 parent fa80b85 commit 68dd32c

File tree

6 files changed

+51
-21
lines changed

6 files changed

+51
-21
lines changed

Cargo.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "polars_bio"
3-
version = "0.7.3"
3+
version = "0.7.4"
44
edition = "2021"
55

66
[lib]
@@ -36,7 +36,7 @@ polars-python = { git = "https://github.com/mwiewior/polars.git" , rev = "9d4fca
3636

3737
#exon ="0.32.4"
3838
exon = { git = "https://github.com/mwiewior/exon.git", rev="d134d923e6c592a9972d93215a12c759c70a7ed5"}
39-
datafusion-vcf = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git/", rev = "4d7e72b4588025d3a020b1a55a16f6484ad846d9"}
39+
datafusion-vcf = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git/", rev = "cc38b4ac3fe2d252245200422930fa879e92b92d"}
4040
async-trait = "0.1.86"
4141
futures = "0.3.31"
4242
coitrees = "0.4.0"

polars_bio/io.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -378,4 +378,4 @@ def from_polars(name: str, df: Union[pl.DataFrame, pl.LazyFrame]) -> None:
378378
def _cleanse_infos(t: Union[list[str], None]) -> Union[list[str], None]:
379379
if t is None:
380380
return None
381-
return [x.upper().strip() for x in t]
381+
return [x.strip() for x in t]

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ build-backend = "maturin"
44

55
[project]
66
name = "polars-bio"
7-
version = "0.7.3"
7+
version = "0.7.4"
88
description = "Blazing fast genomic operations on large Python dataframes"
99
authors = []
1010
requires-python = ">=3.9"

src/query.rs

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -9,9 +9,9 @@ pub(crate) fn nearest_query(query_params: QueryParams) -> String {
99
a.{} AS {}{}, -- pos_end
1010
b.{} AS {}{}, -- contig
1111
b.{} AS {}{}, -- pos_start
12-
b.{} AS {}{}, -- pos_end
13-
{}
12+
b.{} AS {}{} -- pos_end
1413
{}
14+
{},
1515
CAST(
1616
CASE WHEN b.{} >= a.{}
1717
THEN
@@ -52,16 +52,16 @@ pub(crate) fn nearest_query(query_params: QueryParams) -> String {
5252
"a".to_string(),
5353
query_params.suffixes.0.clone(),
5454
)
55-
+ ","
5655
} else {
5756
"".to_string()
5857
},
5958
if !query_params.other_columns_2.is_empty() {
60-
format_non_join_tables(
61-
query_params.other_columns_2.clone(),
62-
"b".to_string(),
63-
query_params.suffixes.1.clone(),
64-
)
59+
",".to_string()
60+
+ &format_non_join_tables(
61+
query_params.other_columns_2.clone(),
62+
"b".to_string(),
63+
query_params.suffixes.1.clone(),
64+
)
6565
} else {
6666
"".to_string()
6767
},
@@ -100,7 +100,7 @@ pub(crate) fn overlap_query(query_params: QueryParams) -> String {
100100
{}
101101
{}
102102
FROM
103-
{} a, {} b
103+
{} AS a, {} AS b
104104
WHERE
105105
a.{}=b.{}
106106
AND
@@ -133,16 +133,16 @@ pub(crate) fn overlap_query(query_params: QueryParams) -> String {
133133
"a".to_string(),
134134
query_params.suffixes.0.clone(),
135135
)
136-
+ ","
137136
} else {
138137
"".to_string()
139138
},
140139
if !query_params.other_columns_2.is_empty() {
141-
format_non_join_tables(
142-
query_params.other_columns_2.clone(),
143-
"b".to_string(),
144-
query_params.suffixes.1.clone(),
145-
)
140+
",".to_string()
141+
+ &format_non_join_tables(
142+
query_params.other_columns_2.clone(),
143+
"b".to_string(),
144+
query_params.suffixes.1.clone(),
145+
)
146146
} else {
147147
"".to_string()
148148
},

tests/test_io.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,36 @@ def test_fields(self):
1717
assert self.df["cigar"][4] == "101M"
1818

1919

20+
class TestIOVCFInfo:
21+
vcf_big = "gs://gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz"
22+
vcf_infos_mixed_cases = (
23+
pb.read_vcf(vcf_big, info_fields=["AF", "vep"], thread_num=1).limit(1).collect()
24+
)
25+
26+
def test_count(self):
27+
assert len(self.vcf_infos_mixed_cases) == 1
28+
29+
30+
class TestVCFViewsOperations:
31+
def test_view(self):
32+
vcf_big = "gs://gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz"
33+
pb.register_vcf(vcf_big, "gnomad_big", info_fields=["AF", "vep"], thread_num=1)
34+
pb.register_view(
35+
"v_gnomad_big",
36+
"SELECT chrom, start, end, split_part(vep, '|', 3) AS impact from gnomad_big where array_element(af,1)=0 and split_part(vep, '|', 3) in ('HIGH', 'MODERATE') limit 10",
37+
)
38+
vcf_sv = "gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz"
39+
pb.register_vcf(
40+
vcf_sv, "gnomad_sv", thread_num=1, info_fields=["SVTYPE", "SVLEN"]
41+
)
42+
pb.register_view(
43+
"v_gnomad_sv", "SELECT chrom, start, end FROM gnomad_sv limit 100"
44+
)
45+
assert len(pb.sql("SELECT * FROM v_gnomad_big").collect()) == 10
46+
assert len(pb.nearest("v_gnomad_sv", "v_gnomad_big").collect()) == 100
47+
assert len(pb.overlap("v_gnomad_sv", "v_gnomad_big").collect()) == 43
48+
49+
2050
class TestIOVCF:
2151
df_bgz = pb.read_vcf(f"{DATA_DIR}/io/vcf/vep.vcf.bgz").collect()
2252
df_none = pb.read_vcf(f"{DATA_DIR}/io/vcf/vep.vcf").collect()

0 commit comments

Comments
 (0)