Skip to content

Commit 0458d30

Browse files
authored
fix: CSV Infer Schema now properly supports escaped characters. (apache#13214)
1 parent 659d5a3 commit 0458d30

File tree

1 file changed

+55
-1
lines changed
  • datafusion/core/src/datasource/file_format

1 file changed

+55
-1
lines changed

datafusion/core/src/datasource/file_format/csv.rs

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,12 @@ impl CsvFormat {
454454
.has_header
455455
.unwrap_or(state.config_options().catalog.has_header),
456456
)
457-
.with_delimiter(self.options.delimiter);
457+
.with_delimiter(self.options.delimiter)
458+
.with_quote(self.options.quote);
459+
460+
if let Some(escape) = self.options.escape {
461+
format = format.with_escape(escape);
462+
}
458463

459464
if let Some(comment) = self.options.comment {
460465
format = format.with_comment(comment);
@@ -867,6 +872,55 @@ mod tests {
867872
Ok(())
868873
}
869874

875+
#[tokio::test]
876+
async fn test_infer_schema_escape_chars() -> Result<()> {
877+
let session_ctx = SessionContext::new();
878+
let state = session_ctx.state();
879+
let variable_object_store = Arc::new(VariableStream::new(
880+
Bytes::from(
881+
r#"c1,c2,c3,c4
882+
0.3,"Here, is a comma\"",third,3
883+
0.31,"double quotes are ok, "" quote",third again,9
884+
0.314,abc,xyz,27"#,
885+
),
886+
1,
887+
));
888+
let object_meta = ObjectMeta {
889+
location: Path::parse("/")?,
890+
last_modified: DateTime::default(),
891+
size: usize::MAX,
892+
e_tag: None,
893+
version: None,
894+
};
895+
896+
let num_rows_to_read = 3;
897+
let csv_format = CsvFormat::default()
898+
.with_has_header(true)
899+
.with_schema_infer_max_rec(num_rows_to_read)
900+
.with_quote(b'"')
901+
.with_escape(Some(b'\\'));
902+
903+
let inferred_schema = csv_format
904+
.infer_schema(
905+
&state,
906+
&(variable_object_store.clone() as Arc<dyn ObjectStore>),
907+
&[object_meta],
908+
)
909+
.await?;
910+
911+
let actual_fields: Vec<_> = inferred_schema
912+
.fields()
913+
.iter()
914+
.map(|f| format!("{}: {:?}", f.name(), f.data_type()))
915+
.collect();
916+
917+
assert_eq!(
918+
vec!["c1: Float64", "c2: Utf8", "c3: Utf8", "c4: Int64",],
919+
actual_fields
920+
);
921+
Ok(())
922+
}
923+
870924
#[rstest(
871925
file_compression_type,
872926
case(FileCompressionType::UNCOMPRESSED),

0 commit comments

Comments
 (0)