11use std:: fs:: File ;
22use std:: io:: Error ;
33use std:: num:: NonZero ;
4+ use std:: sync:: Arc ;
45use async_stream:: stream;
56use bytes:: Bytes ;
7+ use datafusion:: arrow;
8+ use datafusion:: arrow:: array:: StringBuilder ;
9+ use datafusion:: arrow:: datatypes:: SchemaRef ;
10+ use datafusion:: arrow:: ipc:: RecordBatch ;
11+ use datafusion:: datasource:: MemTable ;
612use futures:: { stream, StreamExt } ;
713use futures:: stream:: BoxStream ;
814use log:: debug;
915use noodles:: { bgzf, vcf} ;
1016use noodles:: vcf:: io:: Reader ;
11- use noodles:: vcf:: Record ;
17+ use noodles:: vcf:: { Header , Record } ;
1218use noodles_bgzf:: { AsyncReader , MultithreadedReader } ;
1319use opendal:: { FuturesBytesStream , Operator } ;
1420use opendal:: layers:: { LoggingLayer , RetryLayer , TimeoutLayer } ;
@@ -271,6 +277,20 @@ impl VcfRemoteReader {
271277 }
272278 }
273279 }
280+ pub async fn describe ( & mut self ) -> Result < arrow:: array:: RecordBatch , Error > {
281+ match self {
282+ VcfRemoteReader :: BGZF ( reader) => {
283+ let header = reader. read_header ( ) . await ?;
284+ Ok ( get_info_fields ( & header) . await )
285+ }
286+ VcfRemoteReader :: PLAIN ( reader) => {
287+ let header = reader. read_header ( ) . await ?;
288+ Ok ( get_info_fields ( & header) . await )
289+ }
290+ }
291+ }
292+
293+
274294
275295 pub async fn read_records ( & mut self ) -> BoxStream < ' _ , Result < Record , Error > > {
276296 match self {
@@ -324,4 +344,94 @@ impl VcfLocalReader {
324344 }
325345 }
326346 }
347+ pub async fn describe ( & mut self ) -> Result < arrow:: array:: RecordBatch , Error > {
348+ match self {
349+ VcfLocalReader :: BGZF ( reader) => {
350+ let header = reader. read_header ( ) ?;
351+ Ok ( get_info_fields ( & header) . await )
352+ }
353+ VcfLocalReader :: PLAIN ( reader) => {
354+ let header = reader. read_header ( ) . await ?;
355+ Ok ( get_info_fields ( & header) . await )
356+ }
357+ }
358+ }
359+ }
360+
361+ pub async fn get_info_fields ( header : & Header ) -> arrow:: array:: RecordBatch {
362+ let info_fields = header. infos ( ) ;
363+ let mut field_names = StringBuilder :: new ( ) ;
364+ let mut field_types = StringBuilder :: new ( ) ;
365+ let mut field_descriptions = StringBuilder :: new ( ) ;
366+ for ( field_name, field) in info_fields {
367+ field_names. append_value ( field_name. to_lowercase ( ) ) ;
368+ field_types. append_value ( field. ty ( ) . to_string ( ) ) ;
369+ field_descriptions. append_value ( field. description ( ) ) ;
370+ }
371+ // build RecordBatch
372+ let field_names = field_names. finish ( ) ;
373+ let field_types = field_types. finish ( ) ;
374+ let field_descriptions = field_descriptions. finish ( ) ;
375+ let schema = arrow:: datatypes:: Schema :: new ( vec ! [
376+ arrow:: datatypes:: Field :: new( "name" , arrow:: datatypes:: DataType :: Utf8 , false ) ,
377+ arrow:: datatypes:: Field :: new( "type" , arrow:: datatypes:: DataType :: Utf8 , false ) ,
378+ arrow:: datatypes:: Field :: new( "description" , arrow:: datatypes:: DataType :: Utf8 , false ) ,
379+ ] ) ;
380+ let record_batch = arrow:: record_batch:: RecordBatch :: try_new (
381+ SchemaRef :: from ( schema. clone ( ) ) ,
382+ vec ! [ Arc :: new( field_names) , Arc :: new( field_types) , Arc :: new( field_descriptions) ]
383+ ) . unwrap ( ) ;
384+ record_batch
385+ }
386+
387+ pub enum VcfReader {
388+ Local ( VcfLocalReader ) ,
389+ Remote ( VcfRemoteReader )
390+ }
391+
392+ impl VcfReader {
393+
394+ pub async fn new ( file_path : String , thread_num : Option < usize > , chunk_size : Option < usize > , concurrency_fetches : Option < usize > ) -> Self {
395+ let storage_type = get_storage_type ( file_path. clone ( ) ) ;
396+ match storage_type {
397+ StorageType :: LOCAL => {
398+ VcfReader :: Local ( VcfLocalReader :: new ( file_path, thread_num. unwrap_or ( 1 ) ) . await )
399+ }
400+ _ => {
401+ VcfReader :: Remote ( VcfRemoteReader :: new ( file_path, chunk_size. unwrap_or ( 64 ) , concurrency_fetches. unwrap_or ( 8 ) ) . await )
402+ }
403+ }
404+ }
405+
406+ pub async fn read_header ( & mut self ) -> Result < vcf:: Header , Error > {
407+ match self {
408+ VcfReader :: Local ( reader) => {
409+ reader. read_header ( ) . await
410+ }
411+ VcfReader :: Remote ( reader) => {
412+ reader. read_header ( ) . await
413+ }
414+ }
415+ }
416+ pub async fn describe ( & mut self ) -> Result < arrow:: array:: RecordBatch , Error > {
417+ match self {
418+ VcfReader :: Local ( reader) => {
419+ reader. describe ( ) . await
420+ }
421+ VcfReader :: Remote ( reader) => {
422+ reader. describe ( ) . await
423+ }
424+ }
425+ }
426+ pub async fn read_records ( & mut self ) -> BoxStream < ' _ , Result < Record , Error > > {
427+ match self {
428+ VcfReader :: Local ( reader) => {
429+ reader. read_records ( )
430+ }
431+ VcfReader :: Remote ( reader) => {
432+ reader. read_records ( ) . await
433+ }
434+ }
435+ }
436+
327437}
0 commit comments