Skip to content

Commit 7e56a9c

Browse files
committed
Refactored into FuncFlows
1 parent 9d5131c commit 7e56a9c

File tree

1 file changed

+37
-26
lines changed

1 file changed

+37
-26
lines changed

src/gff_example.py

Lines changed: 37 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
from typing import List, Optional
33

44
from unifair import runtime
5+
from unifair.compute.flow import FuncFlowTemplate
56
from unifair.data.dataset import Dataset
67
from unifair.data.model import Model
78
from unifair.modules.general.tasks import import_directory, split_dataset
@@ -17,6 +18,7 @@
1718
from unifair.modules.tables.models import JsonTableOfStrings
1819

1920
runtime.config.engine = 'local'
21+
runtime.config.prefect.use_cached_results = False
2022

2123
# Tasks
2224

@@ -62,36 +64,45 @@ def attrib_df_names(dataset: Dataset[Model[object]]) -> List[str]:
6264

6365

6466
# Flow
67+
@FuncFlowTemplate
68+
def convert_gff_files(data: Dataset[Model[str]]) -> PandasDataset:
69+
data_2 = slice_lines(data, start=0, end=1000)
6570

66-
data_1 = import_directory.run('input/gff', suffix='.gff', model=Model[str])
67-
data_2 = slice_lines.run(data_1) # , start=0, end=1000)
71+
pd_data_3 = from_csv(data_2, delimiter='\t', first_row_as_col_names=False, col_names=GFF_COLS)
72+
pd_data_4 = extract_columns_as_files(pd_data_3, [ATTRIB_COL])
73+
pd_data_5_main, pd_data_3_attrib = split_dataset(pd_data_4, attrib_df_names(pd_data_4))
6874

69-
pd_data_3 = from_csv.run(data_2, delimiter='\t', first_row_as_col_names=False, col_names=GFF_COLS)
70-
pd_data_4 = extract_columns_as_files.run(pd_data_3, [ATTRIB_COL])
71-
pd_data_5_main, pd_data_3_attrib = split_dataset.run(pd_data_4, attrib_df_names(pd_data_4))
75+
data_6_attrib = to_csv(pd_data_3_attrib, first_row_as_col_names=False)
76+
data_7_attrib = transform_all_lines_to_json(data_6_attrib)
77+
data_8_attrib = transform_datafile_start_and_end_to_json(data_7_attrib)
7278

73-
data_6_attrib = to_csv.run(pd_data_3_attrib, first_row_as_col_names=False)
74-
data_7_attrib = transform_all_lines_to_json.run(data_6_attrib)
75-
data_8_attrib = transform_datafile_start_and_end_to_json.run(data_7_attrib)
79+
data_9_attrib = Dataset[JsonTableOfStrings]()
80+
data_9_attrib.from_json(data_8_attrib.to_data())
7681

77-
data_9_attrib = Dataset[JsonTableOfStrings]()
78-
data_9_attrib.from_json(data_8_attrib.to_data())
82+
pd_data_7_attrib = PandasDataset()
83+
pd_data_7_attrib.from_data(data_9_attrib.to_data())
7984

80-
pd_data_7_attrib = PandasDataset()
81-
pd_data_7_attrib.from_data(data_9_attrib.to_data())
85+
pd_data_10 = concat_dataframes_across_datasets(
86+
[pd_data_5_main, pd_data_7_attrib],
87+
vertical=False,
88+
)
89+
90+
serialize_to_tarpacked_raw_files('1_data', data)
91+
serialize_to_tarpacked_raw_files('2_data', data_2)
92+
serialize_to_tarpacked_csv_files('3_pd_data', pd_data_3)
93+
serialize_to_tarpacked_csv_files('4_pd_data', pd_data_4)
94+
serialize_to_tarpacked_csv_files('5_pd_data_main', pd_data_5_main)
95+
serialize_to_tarpacked_raw_files('6_raw_data_attributes', data_6_attrib)
96+
serialize_to_tarpacked_raw_files('7_raw_data_attributes', data_7_attrib)
97+
serialize_to_tarpacked_raw_files('8_raw_data_attributes', data_8_attrib)
98+
serialize_to_tarpacked_json_files('9_json_data_attributes', data_9_attrib)
99+
serialize_to_tarpacked_csv_files('10_pd_data', pd_data_10)
100+
101+
102+
@FuncFlowTemplate
103+
def import_gff_and_convert_to_pandas() -> PandasDataset:
104+
data: Dataset[Model[str]] = import_directory('input/gff', suffix='.gff', model=Model[str])
105+
return convert_gff_files(data)
82106

83-
pd_data_10 = concat_dataframes_across_datasets.run(
84-
[pd_data_5_main, pd_data_7_attrib],
85-
vertical=False,
86-
)
87107

88-
serialize_to_tarpacked_raw_files('1_data', data_1)
89-
serialize_to_tarpacked_raw_files('2_data', data_2)
90-
serialize_to_tarpacked_csv_files('3_pd_data', pd_data_3)
91-
serialize_to_tarpacked_csv_files('4_pd_data', pd_data_4)
92-
serialize_to_tarpacked_csv_files('5_pd_data_main', pd_data_5_main)
93-
serialize_to_tarpacked_raw_files('6_raw_data_attributes', data_6_attrib)
94-
serialize_to_tarpacked_raw_files('7_raw_data_attributes', data_7_attrib)
95-
serialize_to_tarpacked_raw_files('8_raw_data_attributes', data_8_attrib)
96-
serialize_to_tarpacked_json_files('9_json_data_attributes', data_9_attrib)
97-
serialize_to_tarpacked_csv_files('10_pd_data', pd_data_10)
108+
import_gff_and_convert_to_pandas.run()

0 commit comments

Comments (0)