Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/src/arrow/dataset/file_csv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,7 @@ std::shared_ptr<FileWriteOptions> CsvFileFormat::DefaultWriteOptions() {
new CsvFileWriteOptions(shared_from_this()));
csv_options->write_options =
std::make_shared<csv::WriteOptions>(csv::WriteOptions::Defaults());
csv_options->write_options->delimiter = parse_options.delimiter;
return csv_options;
}

Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/_dataset.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2253,6 +2253,8 @@ cdef class CsvFileFormat(FileFormat):
"""
cdef CsvFileWriteOptions opts = \
<CsvFileWriteOptions> FileFormat.make_write_options(self)
if 'delimiter' not in kwargs:
Comment thread
egolearner marked this conversation as resolved.
Outdated
kwargs['delimiter'] = opts.write_options.delimiter
opts.write_options = WriteOptions(**kwargs)
return opts

Expand Down
46 changes: 46 additions & 0 deletions python/pyarrow/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -2149,3 +2149,49 @@ def test_write_csv_empty_batch_should_not_pollute_output(tables, expected):
result = buf.read()

assert result == expected


def test_write_csv_custom_delimiter_from_parse_options():
    """Check that CsvFileFormat write options inherit the parse delimiter.

    Covers four things:
    * ``make_write_options()`` with no arguments picks up the delimiter
      from the format's parse options;
    * an explicitly passed ``delimiter`` overrides the inherited one;
    * a format built without parse options still writes with ",";
    * ``write_dataset`` actually emits files using the custom delimiter
      and those files round-trip back to the original table.
    """
    import pyarrow.dataset as ds

    table = pa.table({
        "B": ["B1", "B2"],
        "C": ["C1", "C2"],
    })

    # Verify that CsvFileFormat.make_write_options propagates the
    # parse delimiter to write options when no explicit delimiter is given
    for delimiter in [">", "|", "\t", ";"]:
        csv_format = ds.CsvFileFormat(pa.csv.ParseOptions(delimiter=delimiter))
        write_opts = csv_format.make_write_options()
        assert write_opts.write_options.delimiter == delimiter

    # Verify that an explicitly passed delimiter takes precedence
    csv_format = ds.CsvFileFormat(pa.csv.ParseOptions(delimiter=">"))
    write_opts = csv_format.make_write_options(delimiter="|")
    assert write_opts.write_options.delimiter == "|"

    # Verify the default delimiter is still "," when no parse options are given
    csv_format = ds.CsvFileFormat()
    write_opts = csv_format.make_write_options()
    assert write_opts.write_options.delimiter == ","

    # Verify end-to-end: write_dataset with custom delimiter produces
    # output using that delimiter, not the default ","
    with tempfile.TemporaryDirectory() as tmpdir:
        csv_format = ds.CsvFileFormat(pa.csv.ParseOptions(delimiter=">"))
        ds.write_dataset(table, tmpdir, format=csv_format)

        written_paths = [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(tmpdir)
            for name in names
        ]
        # Without this guard the per-file assertions below would pass
        # vacuously if write_dataset produced no files at all.
        assert written_paths, "write_dataset did not produce any files"

        # Check that written CSV files use the custom delimiter
        for path in written_paths:
            # Explicit encoding so the check does not depend on the
            # platform's default locale.
            with open(path, encoding="utf-8") as fh:
                content = fh.read()
            assert ">" in content, (
                f"Expected '>' delimiter in CSV output, got: {content!r}"
            )
            # None of the table's names or values contain a comma, so any
            # comma in the output would have to be the default delimiter.
            assert "," not in content, (
                f"Unexpected ',' delimiter in CSV output, got: {content!r}"
            )

        # Read back and verify roundtrip
        read_format = ds.CsvFileFormat(pa.csv.ParseOptions(delimiter=">"))
        result = ds.dataset(tmpdir, format=read_format).to_table()
        assert result.equals(table)
Loading