|
2 | 2 | from typing import List, Optional |
3 | 3 |
|
4 | 4 | from unifair import runtime |
| 5 | +from unifair.compute.flow import FuncFlowTemplate |
5 | 6 | from unifair.data.dataset import Dataset |
6 | 7 | from unifair.data.model import Model |
7 | 8 | from unifair.modules.general.tasks import import_directory, split_dataset |
|
17 | 18 | from unifair.modules.tables.models import JsonTableOfStrings |
18 | 19 |
|
# Select the 'local' engine for every task/flow defined below.
runtime.config.engine = 'local'
# Switch off `use_cached_results` in the prefect section of the config.
# NOTE(review): presumably forces recomputation instead of reusing cached
# task results when the prefect engine is used -- confirm against the
# runtime config documentation.
runtime.config.prefect.use_cached_results = False
20 | 22 |
|
21 | 23 | # Tasks |
22 | 24 |
|
@@ -62,36 +64,45 @@ def attrib_df_names(dataset: Dataset[Model[object]]) -> List[str]: |
62 | 64 |
|
63 | 65 |
|
64 | 66 | # Flow |
@FuncFlowTemplate
def convert_gff_files(data: Dataset[Model[str]]) -> PandasDataset:
    """Run the GFF conversion steps on ``data`` and return the combined dataset.

    Steps, in order:

    1. ``slice_lines`` is applied with ``start=0, end=1000``.
    2. The result is parsed by ``from_csv`` as tab-delimited text without a
       header row, using ``GFF_COLS`` as column names.
    3. ``extract_columns_as_files`` is called for ``ATTRIB_COL`` and the
       result is split by ``split_dataset`` into a main part and an attribute
       part, using the names from ``attrib_df_names``.
    4. The attribute part is written back with ``to_csv``, passed through the
       two JSON transform tasks, loaded into a ``Dataset[JsonTableOfStrings]``
       and from there into a ``PandasDataset``.
    5. The main and attribute parts are joined by
       ``concat_dataframes_across_datasets`` with ``vertical=False``.

    Every intermediate dataset is additionally handed to one of the
    ``serialize_to_tarpacked_*`` tasks under a numbered name.

    Args:
        data: Dataset of string models to convert (the caller in this file
            passes the imported '.gff' files).

    Returns:
        The dataset produced by ``concat_dataframes_across_datasets``.
    """
    # NOTE(review): presumably caps each file at its first 1000 lines -- confirm
    # against the slice_lines task.
    data_2 = slice_lines(data, start=0, end=1000)

    pd_data_3 = from_csv(data_2, delimiter='\t', first_row_as_col_names=False, col_names=GFF_COLS)
    pd_data_4 = extract_columns_as_files(pd_data_3, [ATTRIB_COL])
    pd_data_5_main, pd_data_3_attrib = split_dataset(pd_data_4, attrib_df_names(pd_data_4))

    # Attribute part: back to raw text, then rewritten step by step into JSON.
    data_6_attrib = to_csv(pd_data_3_attrib, first_row_as_col_names=False)
    data_7_attrib = transform_all_lines_to_json(data_6_attrib)
    data_8_attrib = transform_datafile_start_and_end_to_json(data_7_attrib)

    # Load the generated JSON text into a typed dataset ...
    data_9_attrib = Dataset[JsonTableOfStrings]()
    data_9_attrib.from_json(data_8_attrib.to_data())

    # ... and move that data into a pandas-backed dataset.
    pd_data_7_attrib = PandasDataset()
    pd_data_7_attrib.from_data(data_9_attrib.to_data())

    pd_data_10 = concat_dataframes_across_datasets(
        [pd_data_5_main, pd_data_7_attrib],
        vertical=False,
    )

    # Hand every intermediate result to a serializer task under a numbered name.
    serialize_to_tarpacked_raw_files('1_data', data)
    serialize_to_tarpacked_raw_files('2_data', data_2)
    serialize_to_tarpacked_csv_files('3_pd_data', pd_data_3)
    serialize_to_tarpacked_csv_files('4_pd_data', pd_data_4)
    serialize_to_tarpacked_csv_files('5_pd_data_main', pd_data_5_main)
    serialize_to_tarpacked_raw_files('6_raw_data_attributes', data_6_attrib)
    serialize_to_tarpacked_raw_files('7_raw_data_attributes', data_7_attrib)
    serialize_to_tarpacked_raw_files('8_raw_data_attributes', data_8_attrib)
    serialize_to_tarpacked_json_files('9_json_data_attributes', data_9_attrib)
    serialize_to_tarpacked_csv_files('10_pd_data', pd_data_10)

    # The signature promises a PandasDataset; without this return the flow
    # evaluated to None.
    return pd_data_10
| 100 | + |
| 101 | + |
@FuncFlowTemplate
def import_gff_and_convert_to_pandas() -> PandasDataset:
    """Import the '.gff' files under 'input/gff' and convert them.

    Calls ``import_directory`` with ``Model[str]`` as the model and hands the
    resulting dataset to ``convert_gff_files``.

    Returns:
        Whatever ``convert_gff_files`` returns for the imported dataset.
    """
    gff_files: Dataset[Model[str]] = import_directory(
        'input/gff',
        suffix='.gff',
        model=Model[str],
    )
    converted = convert_gff_files(gff_files)
    return converted
82 | 106 |
|
83 | | -pd_data_10 = concat_dataframes_across_datasets.run( |
84 | | - [pd_data_5_main, pd_data_7_attrib], |
85 | | - vertical=False, |
86 | | -) |
87 | 107 |
|
88 | | -serialize_to_tarpacked_raw_files('1_data', data_1) |
89 | | -serialize_to_tarpacked_raw_files('2_data', data_2) |
90 | | -serialize_to_tarpacked_csv_files('3_pd_data', pd_data_3) |
91 | | -serialize_to_tarpacked_csv_files('4_pd_data', pd_data_4) |
92 | | -serialize_to_tarpacked_csv_files('5_pd_data_main', pd_data_5_main) |
93 | | -serialize_to_tarpacked_raw_files('6_raw_data_attributes', data_6_attrib) |
94 | | -serialize_to_tarpacked_raw_files('7_raw_data_attributes', data_7_attrib) |
95 | | -serialize_to_tarpacked_raw_files('8_raw_data_attributes', data_8_attrib) |
96 | | -serialize_to_tarpacked_json_files('9_json_data_attributes', data_9_attrib) |
97 | | -serialize_to_tarpacked_csv_files('10_pd_data', pd_data_10) |
# Module-level entry point: runs the complete import-and-convert flow as soon
# as this file is executed (or imported).
import_gff_and_convert_to_pandas.run()
0 commit comments