2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
20 #include <arrow-glib/error.hpp>
21 #include <arrow-glib/file-system.hpp>
22 #include <arrow-glib/output-stream.hpp>
23 #include <arrow-glib/record-batch.hpp>
24 #include <arrow-glib/reader.hpp>
25 #include <arrow-glib/schema.hpp>
27 #include <arrow-dataset-glib/file-format.hpp>
32 * SECTION: file-format
33 * @section_id: file-format
34 * @title: File format classes
35 * @include: arrow-dataset-glib/arrow-dataset-glib.h
37 * #GADatasetFileWriteOptions is a class for options to write a file of this format.
40 * #GADatasetFileWriter is a class for writing a file of this format.
42 * #GADatasetFileFormat is a base class for file format classes.
44 * #GADatasetCSVFileFormat is a class for CSV file format.
46 * #GADatasetIPCFileFormat is a class for IPC file format.
48 * #GADatasetParquetFileFormat is a class for Parquet file format.
53 typedef struct GADatasetFileWriteOptionsPrivate_
{
54 std::shared_ptr
<arrow::dataset::FileWriteOptions
> options
;
55 } GADatasetFileWriteOptionsPrivate
;
/* Registers the GADatasetFileWriteOptions GType together with its private
 * data struct. The generated helpers
 * gadataset_file_write_options_get_instance_private() and
 * gadataset_file_write_options_parent_class are used further down.
 * NOTE(review): the parent-type argument and the closing parenthesis are not
 * visible in this listing. */
61 G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileWriteOptions
,
62 gadataset_file_write_options
,
/* Evaluates to the GADatasetFileWriteOptionsPrivate pointer of obj: obj is
 * cast with GADATASET_FILE_WRITE_OPTIONS() and the instance-private pointer
 * is converted with static_cast. */
#define GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(obj)                    \
  static_cast<GADatasetFileWriteOptionsPrivate *>(                       \
    gadataset_file_write_options_get_instance_private(                   \
      GADATASET_FILE_WRITE_OPTIONS(obj)))
/* GObject finalize handler for GADatasetFileWriteOptions. */
71 gadataset_file_write_options_finalize(GObject
*object
)
73 auto priv
= GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object
);
/* The member is created with placement new in the instance initializer, so
 * it is destroyed here with an explicit destructor call. */
74 priv
->options
.~shared_ptr();
/* Chain up to the parent class' finalize. */
75 G_OBJECT_CLASS(gadataset_file_write_options_parent_class
)->finalize(object
);
/* GObject set_property handler for GADatasetFileWriteOptions.
 * The visible lines read the GValue as a pointer to
 * std::shared_ptr<arrow::dataset::FileWriteOptions> and dereference it;
 * G_OBJECT_WARN_INVALID_PROPERTY_ID() reports an invalid property ID.
 * NOTE(review): the remaining parameters, the dispatch on prop_id and the
 * assignment target are not visible in this listing. */
79 gadataset_file_write_options_set_property(GObject
*object
,
84 auto priv
= GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object
);
/* The property value carries a pointer to a shared_ptr, not the shared_ptr
 * itself, hence the cast followed by a dereference. */
89 *static_cast<std::shared_ptr
<arrow::dataset::FileWriteOptions
> *>(
90 g_value_get_pointer(value
));
93 G_OBJECT_WARN_INVALID_PROPERTY_ID(object
, prop_id
, pspec
);
/* Instance initializer for GADatasetFileWriteOptions. */
99 gadataset_file_write_options_init(GADatasetFileWriteOptions
*object
)
101 auto priv
= GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object
);
/* Placement new default-constructs the shared_ptr (initially empty) directly
 * in the instance-private storage. */
102 new(&priv
->options
) std::shared_ptr
<arrow::dataset::FileWriteOptions
>;
/* Class initializer for GADatasetFileWriteOptions.
 * Installs the finalize and set_property handlers and registers the
 * "options" pointer property with the G_PARAM_WRITABLE and
 * G_PARAM_CONSTRUCT_ONLY flags. */
106 gadataset_file_write_options_class_init(GADatasetFileWriteOptionsClass
*klass
)
108 auto gobject_class
= G_OBJECT_CLASS(klass
);
110 gobject_class
->finalize
= gadataset_file_write_options_finalize
;
111 gobject_class
->set_property
= gadataset_file_write_options_set_property
;
/* NOTE(review): the blurb below says "std::shared<...>" while set_property
 * casts the value to a std::shared_ptr<...> pointer; looks like a typo in the
 * blurb - confirm before changing the string.
 * NOTE(review): the declaration of spec and some g_param_spec_pointer()
 * arguments are not visible in this listing. */
114 spec
= g_param_spec_pointer("options",
117 "std::shared<arrow::dataset::FileWriteOptions> *",
118 static_cast<GParamFlags
>(G_PARAM_WRITABLE
|
119 G_PARAM_CONSTRUCT_ONLY
));
120 g_object_class_install_property(gobject_class
, PROP_OPTIONS
, spec
);
124 typedef struct GADatasetFileWriterPrivate_
{
125 std::shared_ptr
<arrow::dataset::FileWriter
> writer
;
126 } GADatasetFileWriterPrivate
;
/* Registers the GADatasetFileWriter GType together with its private data
 * struct. The generated helpers gadataset_file_writer_get_instance_private()
 * and gadataset_file_writer_parent_class are used further down.
 * NOTE(review): the parent-type argument and the closing parenthesis are not
 * visible in this listing. */
132 G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileWriter
,
133 gadataset_file_writer
,
/* Evaluates to the GADatasetFileWriterPrivate pointer of obj: obj is cast
 * with GADATASET_FILE_WRITER() and the instance-private pointer is converted
 * with static_cast. */
#define GADATASET_FILE_WRITER_GET_PRIVATE(obj)                           \
  static_cast<GADatasetFileWriterPrivate *>(                             \
    gadataset_file_writer_get_instance_private(                          \
      GADATASET_FILE_WRITER(obj)))
/* GObject finalize handler for GADatasetFileWriter. */
142 gadataset_file_writer_finalize(GObject
*object
)
144 auto priv
= GADATASET_FILE_WRITER_GET_PRIVATE(object
);
/* The member is created with placement new in the instance initializer, so
 * it is destroyed here with an explicit destructor call. */
145 priv
->writer
.~shared_ptr();
/* Chain up to the parent class' finalize. */
146 G_OBJECT_CLASS(gadataset_file_writer_parent_class
)->finalize(object
);
/* GObject set_property handler for GADatasetFileWriter.
 * The visible lines read the GValue as a pointer to
 * std::shared_ptr<arrow::dataset::FileWriter> and dereference it;
 * G_OBJECT_WARN_INVALID_PROPERTY_ID() reports an invalid property ID.
 * NOTE(review): the remaining parameters, the dispatch on prop_id and the
 * assignment target are not visible in this listing. */
150 gadataset_file_writer_set_property(GObject
*object
,
155 auto priv
= GADATASET_FILE_WRITER_GET_PRIVATE(object
);
/* The property value carries a pointer to a shared_ptr, not the shared_ptr
 * itself, hence the cast followed by a dereference. */
160 *static_cast<std::shared_ptr
<arrow::dataset::FileWriter
> *>(
161 g_value_get_pointer(value
));
164 G_OBJECT_WARN_INVALID_PROPERTY_ID(object
, prop_id
, pspec
);
/* Instance initializer for GADatasetFileWriter. */
170 gadataset_file_writer_init(GADatasetFileWriter
*object
)
172 auto priv
= GADATASET_FILE_WRITER_GET_PRIVATE(object
);
/* Placement new default-constructs the shared_ptr (initially empty) directly
 * in the instance-private storage. */
173 new(&(priv
->writer
)) std::shared_ptr
<arrow::dataset::FileWriter
>;
/* Class initializer for GADatasetFileWriter.
 * Installs the finalize and set_property handlers and registers the "writer"
 * pointer property with the G_PARAM_WRITABLE and G_PARAM_CONSTRUCT_ONLY
 * flags. */
177 gadataset_file_writer_class_init(GADatasetFileWriterClass
*klass
)
179 auto gobject_class
= G_OBJECT_CLASS(klass
);
181 gobject_class
->finalize
= gadataset_file_writer_finalize
;
182 gobject_class
->set_property
= gadataset_file_writer_set_property
;
/* NOTE(review): the blurb below says "std::shared<...>" while set_property
 * casts the value to a std::shared_ptr<...> pointer; looks like a typo in the
 * blurb - confirm before changing the string.
 * NOTE(review): the declaration of spec and some g_param_spec_pointer()
 * arguments are not visible in this listing. */
185 spec
= g_param_spec_pointer("writer",
188 "std::shared<arrow::dataset::FileWriter> *",
189 static_cast<GParamFlags
>(G_PARAM_WRITABLE
|
190 G_PARAM_CONSTRUCT_ONLY
));
191 g_object_class_install_property(gobject_class
, PROP_WRITER
, spec
);
195 * gadataset_file_writer_write_record_batch:
196 * @writer: A #GADatasetFileWriter.
197 * @record_batch: A #GArrowRecordBatch to be written.
198 * @error: (nullable): Return location for a #GError or %NULL.
200 * Returns: %TRUE on success, %FALSE on error.
/* Implementation note: unwraps the GLib writer and record batch into their
 * Arrow C++ objects, calls Write() on the Arrow writer and returns the result
 * of garrow::check() for the resulting status.
 * NOTE(review): the return type and the error parameter lines are not
 * visible in this listing. */
205 gadataset_file_writer_write_record_batch(GADatasetFileWriter
*writer
,
206 GArrowRecordBatch
*record_batch
,
209 const auto arrow_writer
= gadataset_file_writer_get_raw(writer
);
210 const auto arrow_record_batch
= garrow_record_batch_get_raw(record_batch
);
211 auto status
= arrow_writer
->Write(arrow_record_batch
);
/* The string literal is the context tag passed to garrow::check(). */
212 return garrow::check(error
, status
, "[file-writer][write-record-batch]");
216 * gadataset_file_writer_write_record_batch_reader:
217 * @writer: A #GADatasetFileWriter.
218 * @reader: A #GArrowRecordBatchReader to be written.
219 * @error: (nullable): Return location for a #GError or %NULL.
221 * Returns: %TRUE on success, %FALSE on error.
/* Implementation note: unwraps the GLib writer and record batch reader into
 * their Arrow C++ objects, calls Write() on the Arrow writer with the
 * reader's raw pointer and returns the result of garrow::check().
 * NOTE(review): the return type, the error parameter and the middle
 * garrow::check() argument are not visible in this listing. */
226 gadataset_file_writer_write_record_batch_reader(GADatasetFileWriter
*writer
,
227 GArrowRecordBatchReader
*reader
,
230 const auto arrow_writer
= gadataset_file_writer_get_raw(writer
);
231 auto arrow_reader
= garrow_record_batch_reader_get_raw(reader
);
/* Write() is handed the raw pointer via .get(); arrow_reader stays in scope
 * for the duration of the call. */
232 auto status
= arrow_writer
->Write(arrow_reader
.get());
233 return garrow::check(error
,
235 "[file-writer][write-record-batch-reader]");
239 * gadataset_file_writer_finish:
240 * @writer: A #GADatasetFileWriter.
241 * @error: (nullable): Return location for a #GError or %NULL.
243 * Returns: %TRUE on success, %FALSE on error.
/* Implementation note: unwraps the GLib writer into its Arrow C++ object,
 * calls Finish() on it and returns the result of garrow::check().
 * NOTE(review): the return type, the error parameter and the middle
 * garrow::check() argument are not visible in this listing. */
248 gadataset_file_writer_finish(GADatasetFileWriter
*writer
,
251 const auto arrow_writer
= gadataset_file_writer_get_raw(writer
);
252 auto status
= arrow_writer
->Finish();
253 return garrow::check(error
,
255 "[file-writer][finish]");
259 typedef struct GADatasetFileFormatPrivate_
{
260 std::shared_ptr
<arrow::dataset::FileFormat
> format
;
261 } GADatasetFileFormatPrivate
;
/* Registers the GADatasetFileFormat GType together with its private data
 * struct. The generated helpers gadataset_file_format_get_instance_private()
 * and gadataset_file_format_parent_class are used further down.
 * NOTE(review): the parent-type argument and the closing parenthesis are not
 * visible in this listing. */
267 G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileFormat
,
268 gadataset_file_format
,
/* Evaluates to the GADatasetFileFormatPrivate pointer of obj: obj is cast
 * with GADATASET_FILE_FORMAT() and the instance-private pointer is converted
 * with static_cast. */
#define GADATASET_FILE_FORMAT_GET_PRIVATE(obj)                           \
  static_cast<GADatasetFileFormatPrivate *>(                             \
    gadataset_file_format_get_instance_private(                          \
      GADATASET_FILE_FORMAT(obj)))
/* GObject finalize handler for GADatasetFileFormat. */
277 gadataset_file_format_finalize(GObject
*object
)
279 auto priv
= GADATASET_FILE_FORMAT_GET_PRIVATE(object
);
/* The member is created with placement new in the instance initializer, so
 * it is destroyed here with an explicit destructor call. */
280 priv
->format
.~shared_ptr();
/* Chain up to the parent class' finalize. */
281 G_OBJECT_CLASS(gadataset_file_format_parent_class
)->finalize(object
);
/* GObject set_property handler for GADatasetFileFormat.
 * The visible lines read the GValue as a pointer to
 * std::shared_ptr<arrow::dataset::FileFormat> and dereference it;
 * G_OBJECT_WARN_INVALID_PROPERTY_ID() reports an invalid property ID.
 * NOTE(review): the remaining parameters, the dispatch on prop_id and the
 * assignment target are not visible in this listing. */
285 gadataset_file_format_set_property(GObject
*object
,
290 auto priv
= GADATASET_FILE_FORMAT_GET_PRIVATE(object
);
/* The property value carries a pointer to a shared_ptr, not the shared_ptr
 * itself, hence the cast followed by a dereference. */
295 *static_cast<std::shared_ptr
<arrow::dataset::FileFormat
> *>(
296 g_value_get_pointer(value
));
299 G_OBJECT_WARN_INVALID_PROPERTY_ID(object
, prop_id
, pspec
);
/* Instance initializer for GADatasetFileFormat. */
305 gadataset_file_format_init(GADatasetFileFormat
*object
)
307 auto priv
= GADATASET_FILE_FORMAT_GET_PRIVATE(object
);
/* Placement new default-constructs the shared_ptr (initially empty) directly
 * in the instance-private storage. */
308 new(&priv
->format
) std::shared_ptr
<arrow::dataset::FileFormat
>;
/* Class initializer for GADatasetFileFormat.
 * Installs the finalize and set_property handlers and registers the "format"
 * pointer property with the G_PARAM_WRITABLE and G_PARAM_CONSTRUCT_ONLY
 * flags. */
312 gadataset_file_format_class_init(GADatasetFileFormatClass
*klass
)
314 auto gobject_class
= G_OBJECT_CLASS(klass
);
316 gobject_class
->finalize
= gadataset_file_format_finalize
;
317 gobject_class
->set_property
= gadataset_file_format_set_property
;
/* NOTE(review): the blurb below says "std::shared<...>" while set_property
 * casts the value to a std::shared_ptr<...> pointer; looks like a typo in the
 * blurb - confirm before changing the string.
 * NOTE(review): the declaration of spec and some g_param_spec_pointer()
 * arguments are not visible in this listing. */
320 spec
= g_param_spec_pointer("format",
322 "The raw std::shared<arrow::dataset::FileFormat> *",
323 static_cast<GParamFlags
>(G_PARAM_WRITABLE
|
324 G_PARAM_CONSTRUCT_ONLY
));
325 g_object_class_install_property(gobject_class
, PROP_FORMAT
, spec
);
329 * gadataset_file_format_get_type_name:
330 * @format: A #GADatasetFileFormat.
332 * Returns: The type name of @format.
334 * It should be freed with g_free() when no longer needed.
/* Implementation note: reads type_name() from the wrapped Arrow format and
 * returns a g_strndup() copy built from its data() and size().
 * NOTE(review): the return type line is not visible in this listing. */
339 gadataset_file_format_get_type_name(GADatasetFileFormat
*format
)
341 const auto arrow_format
= gadataset_file_format_get_raw(format
);
/* Bound by const reference; it is only read while arrow_format is alive. */
342 const auto &type_name
= arrow_format
->type_name();
343 return g_strndup(type_name
.data(), type_name
.size());
347 * gadataset_file_format_get_default_write_options:
348 * @format: A #GADatasetFileFormat.
350 * Returns: (transfer full): The default #GADatasetFileWriteOptions of @format.
/* Implementation note: asks the wrapped Arrow format for
 * DefaultWriteOptions() and wraps the result in a new
 * GADatasetFileWriteOptions via gadataset_file_write_options_new_raw(). */
354 GADatasetFileWriteOptions
*
355 gadataset_file_format_get_default_write_options(GADatasetFileFormat
*format
)
357 const auto arrow_format
= gadataset_file_format_get_raw(format
);
358 auto arrow_options
= arrow_format
->DefaultWriteOptions();
/* The address of the local is passed; new_raw() hands it to g_object_new()
 * as the "options" construct property. */
359 return gadataset_file_write_options_new_raw(&arrow_options
);
363 * gadataset_file_format_open_writer:
364 * @format: A #GADatasetFileFormat.
365 * @destination: A #GArrowOutputStream.
366 * @file_system: The #GArrowFileSystem of @destination.
367 * @path: The path of @destination.
368 * @schema: A #GArrowSchema that is used by written record batches.
369 * @options: A #GADatasetFileWriteOptions.
370 * @error: (nullable): Return location for a #GError or %NULL.
372 * Returns: (transfer full): The newly created #GADatasetFileWriter of @format
373 * on success, %NULL on error.
/* Implementation note: unwraps every GLib argument into its Arrow C++
 * object, calls MakeWriter() on the Arrow format and, when garrow::check()
 * accepts the result, wraps the produced writer with
 * gadataset_file_writer_new_raw().
 * NOTE(review): the path and error parameters, two MakeWriter() arguments
 * and the failure branch are not visible in this listing. */
377 GADatasetFileWriter
*
378 gadataset_file_format_open_writer(GADatasetFileFormat
*format
,
379 GArrowOutputStream
*destination
,
380 GArrowFileSystem
*file_system
,
382 GArrowSchema
*schema
,
383 GADatasetFileWriteOptions
*options
,
386 const auto arrow_format
= gadataset_file_format_get_raw(format
);
387 auto arrow_destination
= garrow_output_stream_get_raw(destination
);
388 auto arrow_file_system
= garrow_file_system_get_raw(file_system
);
389 auto arrow_schema
= garrow_schema_get_raw(schema
);
390 auto arrow_options
= gadataset_file_write_options_get_raw(options
);
391 auto arrow_writer_result
=
392 arrow_format
->MakeWriter(arrow_destination
,
/* The braced pair below combines the file system and the path into the last
 * MakeWriter() argument. */
395 {arrow_file_system
, path
});
396 if (garrow::check(error
, arrow_writer_result
, "[file-format][open-writer]")) {
/* Dereferencing the checked result yields the writer shared_ptr. */
397 auto arrow_writer
= *arrow_writer_result
;
398 return gadataset_file_writer_new_raw(&arrow_writer
);
405 * gadataset_file_format_equal:
406 * @format: A #GADatasetFileFormat.
407 * @other_format: A #GADatasetFileFormat to be compared.
409 * Returns: %TRUE if they are the same content file format, %FALSE otherwise.
/* Implementation note: unwraps both GLib formats and returns the result of
 * Equals() called on the first Arrow format with the second one dereferenced.
 * NOTE(review): the return type line is not visible in this listing. */
414 gadataset_file_format_equal(GADatasetFileFormat
*format
,
415 GADatasetFileFormat
*other_format
)
417 const auto arrow_format
= gadataset_file_format_get_raw(format
);
418 const auto arrow_other_format
= gadataset_file_format_get_raw(other_format
);
419 return arrow_format
->Equals(*arrow_other_format
);
423 G_DEFINE_TYPE(GADatasetCSVFileFormat
,
424 gadataset_csv_file_format
,
425 GADATASET_TYPE_FILE_FORMAT
)
/* Instance initializer for GADatasetCSVFileFormat.
 * NOTE(review): the body is not visible in this listing. */
428 gadataset_csv_file_format_init(GADatasetCSVFileFormat
*object
)
/* Class initializer for GADatasetCSVFileFormat.
 * NOTE(review): the body is not visible in this listing. */
433 gadataset_csv_file_format_class_init(GADatasetCSVFileFormatClass
*klass
)
438 * gadataset_csv_file_format_new:
440 * Returns: The newly created CSV file format.
/* Implementation note: creates an arrow::dataset::CsvFileFormat with
 * std::make_shared, holds it as std::shared_ptr<arrow::dataset::FileFormat>
 * and wraps it through gadataset_file_format_new_raw(); the result is cast
 * with GADATASET_CSV_FILE_FORMAT(). */
444 GADatasetCSVFileFormat
*
445 gadataset_csv_file_format_new(void)
447 std::shared_ptr
<arrow::dataset::FileFormat
> arrow_format
=
448 std::make_shared
<arrow::dataset::CsvFileFormat
>();
449 return GADATASET_CSV_FILE_FORMAT(gadataset_file_format_new_raw(&arrow_format
));
453 G_DEFINE_TYPE(GADatasetIPCFileFormat
,
454 gadataset_ipc_file_format
,
455 GADATASET_TYPE_FILE_FORMAT
)
/* Instance initializer for GADatasetIPCFileFormat.
 * NOTE(review): the body is not visible in this listing. */
458 gadataset_ipc_file_format_init(GADatasetIPCFileFormat
*object
)
/* Class initializer for GADatasetIPCFileFormat.
 * NOTE(review): the body is not visible in this listing. */
463 gadataset_ipc_file_format_class_init(GADatasetIPCFileFormatClass
*klass
)
468 * gadataset_ipc_file_format_new:
470 * Returns: The newly created IPC file format.
/* Implementation note: creates an arrow::dataset::IpcFileFormat with
 * std::make_shared, holds it as std::shared_ptr<arrow::dataset::FileFormat>
 * and wraps it through gadataset_file_format_new_raw(); the result is cast
 * with GADATASET_IPC_FILE_FORMAT(). */
474 GADatasetIPCFileFormat
*
475 gadataset_ipc_file_format_new(void)
477 std::shared_ptr
<arrow::dataset::FileFormat
> arrow_format
=
478 std::make_shared
<arrow::dataset::IpcFileFormat
>();
479 return GADATASET_IPC_FILE_FORMAT(gadataset_file_format_new_raw(&arrow_format
));
483 G_DEFINE_TYPE(GADatasetParquetFileFormat
,
484 gadataset_parquet_file_format
,
485 GADATASET_TYPE_FILE_FORMAT
)
/* Instance initializer for GADatasetParquetFileFormat.
 * NOTE(review): the body is not visible in this listing. */
488 gadataset_parquet_file_format_init(GADatasetParquetFileFormat
*object
)
/* Class initializer for GADatasetParquetFileFormat.
 * NOTE(review): the body is not visible in this listing. */
493 gadataset_parquet_file_format_class_init(GADatasetParquetFileFormatClass
*klass
)
498 * gadataset_parquet_file_format_new:
500 * Returns: The newly created Parquet file format.
/* Implementation note: creates an arrow::dataset::ParquetFileFormat with
 * std::make_shared, holds it as std::shared_ptr<arrow::dataset::FileFormat>
 * and wraps it through gadataset_file_format_new_raw(); the result is cast
 * with GADATASET_PARQUET_FILE_FORMAT(). */
504 GADatasetParquetFileFormat
*
505 gadataset_parquet_file_format_new(void)
507 std::shared_ptr
<arrow::dataset::FileFormat
> arrow_format
=
508 std::make_shared
<arrow::dataset::ParquetFileFormat
>();
509 return GADATASET_PARQUET_FILE_FORMAT(
510 gadataset_file_format_new_raw(&arrow_format
));
/* Wraps an Arrow C++ FileWriteOptions shared_ptr in a new
 * GADatasetFileWriteOptions by passing the pointer as the "options"
 * construct property to g_object_new().
 * NOTE(review): the end of the g_object_new() argument list is not visible
 * in this listing. */
516 GADatasetFileWriteOptions
*
517 gadataset_file_write_options_new_raw(
518 std::shared_ptr
<arrow::dataset::FileWriteOptions
> *arrow_options
)
520 return GADATASET_FILE_WRITE_OPTIONS(
521 g_object_new(GADATASET_TYPE_FILE_WRITE_OPTIONS
,
522 "options", arrow_options
,
/* Returns, by value, the Arrow C++ FileWriteOptions shared_ptr stored in the
 * instance-private struct of options. */
526 std::shared_ptr
<arrow::dataset::FileWriteOptions
>
527 gadataset_file_write_options_get_raw(GADatasetFileWriteOptions
*options
)
529 auto priv
= GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(options
);
530 return priv
->options
;
/* Wraps an Arrow C++ FileWriter shared_ptr in a new GADatasetFileWriter by
 * passing the pointer as the "writer" construct property to g_object_new().
 * NOTE(review): the end of the g_object_new() argument list is not visible
 * in this listing. */
534 GADatasetFileWriter
*
535 gadataset_file_writer_new_raw(
536 std::shared_ptr
<arrow::dataset::FileWriter
> *arrow_writer
)
538 return GADATASET_FILE_WRITER(g_object_new(GADATASET_TYPE_FILE_WRITER
,
539 "writer", arrow_writer
,
/* Accessor for the Arrow C++ FileWriter behind a GADatasetFileWriter; the
 * declared return type is a std::shared_ptr<arrow::dataset::FileWriter> by
 * value.
 * NOTE(review): the return statement is not visible in this listing;
 * presumably it returns priv->writer - confirm against the full file. */
543 std::shared_ptr
<arrow::dataset::FileWriter
>
544 gadataset_file_writer_get_raw(GADatasetFileWriter
*writer
)
546 auto priv
= GADATASET_FILE_WRITER_GET_PRIVATE(writer
);
/* Wraps an Arrow C++ FileFormat shared_ptr in a GObject whose GType is
 * selected from the format's type_name(): "csv", "ipc" and "parquet" map to
 * the matching subclass types, anything else keeps
 * GADATASET_TYPE_FILE_FORMAT. The pointer is passed as the "format" construct
 * property to g_object_new().
 * NOTE(review): the end of the g_object_new() argument list is not visible
 * in this listing. */
551 GADatasetFileFormat
*
552 gadataset_file_format_new_raw(
553 std::shared_ptr
<arrow::dataset::FileFormat
> *arrow_format
)
/* Default to the base type; overridden below for known type names. */
555 GType type
= GADATASET_TYPE_FILE_FORMAT
;
556 const auto &type_name
= (*arrow_format
)->type_name();
557 if (type_name
== "csv") {
558 type
= GADATASET_TYPE_CSV_FILE_FORMAT
;
559 } else if (type_name
== "ipc") {
560 type
= GADATASET_TYPE_IPC_FILE_FORMAT
;
561 } else if (type_name
== "parquet") {
562 type
= GADATASET_TYPE_PARQUET_FILE_FORMAT
;
564 return GADATASET_FILE_FORMAT(g_object_new(type
,
565 "format", arrow_format
,
569 std::shared_ptr
<arrow::dataset::FileFormat
>
570 gadataset_file_format_get_raw(GADatasetFileFormat
*format
)
572 auto priv
= GADATASET_FILE_FORMAT_GET_PRIVATE(format
);