]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/c_glib/arrow-dataset-glib/file-format.cpp
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / c_glib / arrow-dataset-glib / file-format.cpp
1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20 #include <arrow-glib/error.hpp>
21 #include <arrow-glib/file-system.hpp>
22 #include <arrow-glib/output-stream.hpp>
23 #include <arrow-glib/record-batch.hpp>
24 #include <arrow-glib/reader.hpp>
25 #include <arrow-glib/schema.hpp>
26
27 #include <arrow-dataset-glib/file-format.hpp>
28
29 G_BEGIN_DECLS
30
31 /**
32 * SECTION: file-format
33 * @section_id: file-format
34 * @title: File format classes
35 * @include: arrow-dataset-glib/arrow-dataset-glib.h
36 *
37 * #GADatasetFileWriteOptions is a class for options to write a file
38 * of this format.
39 *
40 * #GADatasetFileWriter is a class for writing a file of this format.
41 *
42 * #GADatasetFileFormat is a base class for file format classes.
43 *
44 * #GADatasetCSVFileFormat is a class for CSV file format.
45 *
46 * #GADatasetIPCFileFormat is a class for IPC file format.
47 *
48 * #GADatasetParquetFileFormat is a class for Parquet file format.
49 *
50 * Since: 3.0.0
51 */
52
53 typedef struct GADatasetFileWriteOptionsPrivate_ {
54 std::shared_ptr<arrow::dataset::FileWriteOptions> options;
55 } GADatasetFileWriteOptionsPrivate;
56
57 enum {
58 PROP_OPTIONS = 1,
59 };
60
61 G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileWriteOptions,
62 gadataset_file_write_options,
63 G_TYPE_OBJECT)
64
65 #define GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(obj) \
66 static_cast<GADatasetFileWriteOptionsPrivate *>( \
67 gadataset_file_write_options_get_instance_private( \
68 GADATASET_FILE_WRITE_OPTIONS(obj)))
69
70 static void
71 gadataset_file_write_options_finalize(GObject *object)
72 {
73 auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object);
74 priv->options.~shared_ptr();
75 G_OBJECT_CLASS(gadataset_file_write_options_parent_class)->finalize(object);
76 }
77
78 static void
79 gadataset_file_write_options_set_property(GObject *object,
80 guint prop_id,
81 const GValue *value,
82 GParamSpec *pspec)
83 {
84 auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object);
85
86 switch (prop_id) {
87 case PROP_OPTIONS:
88 priv->options =
89 *static_cast<std::shared_ptr<arrow::dataset::FileWriteOptions> *>(
90 g_value_get_pointer(value));
91 break;
92 default:
93 G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
94 break;
95 }
96 }
97
98 static void
99 gadataset_file_write_options_init(GADatasetFileWriteOptions *object)
100 {
101 auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object);
102 new(&priv->options) std::shared_ptr<arrow::dataset::FileWriteOptions>;
103 }
104
105 static void
106 gadataset_file_write_options_class_init(GADatasetFileWriteOptionsClass *klass)
107 {
108 auto gobject_class = G_OBJECT_CLASS(klass);
109
110 gobject_class->finalize = gadataset_file_write_options_finalize;
111 gobject_class->set_property = gadataset_file_write_options_set_property;
112
113 GParamSpec *spec;
114 spec = g_param_spec_pointer("options",
115 "Options",
116 "The raw "
117 "std::shared<arrow::dataset::FileWriteOptions> *",
118 static_cast<GParamFlags>(G_PARAM_WRITABLE |
119 G_PARAM_CONSTRUCT_ONLY));
120 g_object_class_install_property(gobject_class, PROP_OPTIONS, spec);
121 }
122
123
124 typedef struct GADatasetFileWriterPrivate_ {
125 std::shared_ptr<arrow::dataset::FileWriter> writer;
126 } GADatasetFileWriterPrivate;
127
128 enum {
129 PROP_WRITER = 1,
130 };
131
132 G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileWriter,
133 gadataset_file_writer,
134 G_TYPE_OBJECT)
135
136 #define GADATASET_FILE_WRITER_GET_PRIVATE(obj) \
137 static_cast<GADatasetFileWriterPrivate *>( \
138 gadataset_file_writer_get_instance_private( \
139 GADATASET_FILE_WRITER(obj)))
140
141 static void
142 gadataset_file_writer_finalize(GObject *object)
143 {
144 auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object);
145 priv->writer.~shared_ptr();
146 G_OBJECT_CLASS(gadataset_file_writer_parent_class)->finalize(object);
147 }
148
149 static void
150 gadataset_file_writer_set_property(GObject *object,
151 guint prop_id,
152 const GValue *value,
153 GParamSpec *pspec)
154 {
155 auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object);
156
157 switch (prop_id) {
158 case PROP_WRITER:
159 priv->writer =
160 *static_cast<std::shared_ptr<arrow::dataset::FileWriter> *>(
161 g_value_get_pointer(value));
162 break;
163 default:
164 G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
165 break;
166 }
167 }
168
169 static void
170 gadataset_file_writer_init(GADatasetFileWriter *object)
171 {
172 auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object);
173 new(&(priv->writer)) std::shared_ptr<arrow::dataset::FileWriter>;
174 }
175
176 static void
177 gadataset_file_writer_class_init(GADatasetFileWriterClass *klass)
178 {
179 auto gobject_class = G_OBJECT_CLASS(klass);
180
181 gobject_class->finalize = gadataset_file_writer_finalize;
182 gobject_class->set_property = gadataset_file_writer_set_property;
183
184 GParamSpec *spec;
185 spec = g_param_spec_pointer("writer",
186 "Writer",
187 "The raw "
188 "std::shared<arrow::dataset::FileWriter> *",
189 static_cast<GParamFlags>(G_PARAM_WRITABLE |
190 G_PARAM_CONSTRUCT_ONLY));
191 g_object_class_install_property(gobject_class, PROP_WRITER, spec);
192 }
193
194 /**
195 * gadataset_file_writer_write_record_batch:
196 * @writer: A #GADatasetFileWriter.
197 * @record_batch: A #GArrowRecordBatch to be written.
198 * @error: (nullable): Return location for a #GError or %NULL.
199 *
200 * Returns: %TRUE on success, %FALSE on error.
201 *
202 * Since: 6.0.0
203 */
204 gboolean
205 gadataset_file_writer_write_record_batch(GADatasetFileWriter *writer,
206 GArrowRecordBatch *record_batch,
207 GError **error)
208 {
209 const auto arrow_writer = gadataset_file_writer_get_raw(writer);
210 const auto arrow_record_batch = garrow_record_batch_get_raw(record_batch);
211 auto status = arrow_writer->Write(arrow_record_batch);
212 return garrow::check(error, status, "[file-writer][write-record-batch]");
213 }
214
215 /**
216 * gadataset_file_writer_write_record_batch_reader:
217 * @writer: A #GADatasetFileWriter.
218 * @reader: A #GArrowRecordBatchReader to be written.
219 * @error: (nullable): Return location for a #GError or %NULL.
220 *
221 * Returns: %TRUE on success, %FALSE on error.
222 *
223 * Since: 6.0.0
224 */
225 gboolean
226 gadataset_file_writer_write_record_batch_reader(GADatasetFileWriter *writer,
227 GArrowRecordBatchReader *reader,
228 GError **error)
229 {
230 const auto arrow_writer = gadataset_file_writer_get_raw(writer);
231 auto arrow_reader = garrow_record_batch_reader_get_raw(reader);
232 auto status = arrow_writer->Write(arrow_reader.get());
233 return garrow::check(error,
234 status,
235 "[file-writer][write-record-batch-reader]");
236 }
237
238 /**
239 * gadataset_file_writer_finish:
240 * @writer: A #GADatasetFileWriter.
241 * @error: (nullable): Return location for a #GError or %NULL.
242 *
243 * Returns: %TRUE on success, %FALSE on error.
244 *
245 * Since: 6.0.0
246 */
247 gboolean
248 gadataset_file_writer_finish(GADatasetFileWriter *writer,
249 GError **error)
250 {
251 const auto arrow_writer = gadataset_file_writer_get_raw(writer);
252 auto status = arrow_writer->Finish();
253 return garrow::check(error,
254 status,
255 "[file-writer][finish]");
256 }
257
258
259 typedef struct GADatasetFileFormatPrivate_ {
260 std::shared_ptr<arrow::dataset::FileFormat> format;
261 } GADatasetFileFormatPrivate;
262
263 enum {
264 PROP_FORMAT = 1,
265 };
266
267 G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileFormat,
268 gadataset_file_format,
269 G_TYPE_OBJECT)
270
271 #define GADATASET_FILE_FORMAT_GET_PRIVATE(obj) \
272 static_cast<GADatasetFileFormatPrivate *>( \
273 gadataset_file_format_get_instance_private( \
274 GADATASET_FILE_FORMAT(obj)))
275
276 static void
277 gadataset_file_format_finalize(GObject *object)
278 {
279 auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object);
280 priv->format.~shared_ptr();
281 G_OBJECT_CLASS(gadataset_file_format_parent_class)->finalize(object);
282 }
283
284 static void
285 gadataset_file_format_set_property(GObject *object,
286 guint prop_id,
287 const GValue *value,
288 GParamSpec *pspec)
289 {
290 auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object);
291
292 switch (prop_id) {
293 case PROP_FORMAT:
294 priv->format =
295 *static_cast<std::shared_ptr<arrow::dataset::FileFormat> *>(
296 g_value_get_pointer(value));
297 break;
298 default:
299 G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
300 break;
301 }
302 }
303
304 static void
305 gadataset_file_format_init(GADatasetFileFormat *object)
306 {
307 auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object);
308 new(&priv->format) std::shared_ptr<arrow::dataset::FileFormat>;
309 }
310
311 static void
312 gadataset_file_format_class_init(GADatasetFileFormatClass *klass)
313 {
314 auto gobject_class = G_OBJECT_CLASS(klass);
315
316 gobject_class->finalize = gadataset_file_format_finalize;
317 gobject_class->set_property = gadataset_file_format_set_property;
318
319 GParamSpec *spec;
320 spec = g_param_spec_pointer("format",
321 "Format",
322 "The raw std::shared<arrow::dataset::FileFormat> *",
323 static_cast<GParamFlags>(G_PARAM_WRITABLE |
324 G_PARAM_CONSTRUCT_ONLY));
325 g_object_class_install_property(gobject_class, PROP_FORMAT, spec);
326 }
327
328 /**
329 * gadataset_file_format_get_type_name:
330 * @format: A #GADatasetFileFormat.
331 *
332 * Returns: The type name of @format.
333 *
334 * It should be freed with g_free() when no longer needed.
335 *
336 * Since: 3.0.0
337 */
338 gchar *
339 gadataset_file_format_get_type_name(GADatasetFileFormat *format)
340 {
341 const auto arrow_format = gadataset_file_format_get_raw(format);
342 const auto &type_name = arrow_format->type_name();
343 return g_strndup(type_name.data(), type_name.size());
344 }
345
346 /**
347 * gadataset_file_format_get_default_write_options:
348 * @format: A #GADatasetFileFormat.
349 *
350 * Returns: (transfer full): The default #GADatasetFileWriteOptions of @format.
351 *
352 * Since: 6.0.0
353 */
354 GADatasetFileWriteOptions *
355 gadataset_file_format_get_default_write_options(GADatasetFileFormat *format)
356 {
357 const auto arrow_format = gadataset_file_format_get_raw(format);
358 auto arrow_options = arrow_format->DefaultWriteOptions();
359 return gadataset_file_write_options_new_raw(&arrow_options);
360 }
361
362 /**
363 * gadataset_file_format_open_writer:
364 * @format: A #GADatasetFileFormat.
365 * @destination: A #GArrowOutputStream.
366 * @file_system: The #GArrowFileSystem of @destination.
367 * @path: The path of @destination.
368 * @schema: A #GArrowSchema that is used by written record batches.
369 * @options: A #GADatasetFileWriteOptions.
370 * @error: (nullable): Return location for a #GError or %NULL.
371 *
372 * Returns: (transfer full): The newly created #GADatasetFileWriter of @format
373 * on success, %NULL on error.
374 *
375 * Since: 6.0.0
376 */
377 GADatasetFileWriter *
378 gadataset_file_format_open_writer(GADatasetFileFormat *format,
379 GArrowOutputStream *destination,
380 GArrowFileSystem *file_system,
381 const gchar *path,
382 GArrowSchema *schema,
383 GADatasetFileWriteOptions *options,
384 GError **error)
385 {
386 const auto arrow_format = gadataset_file_format_get_raw(format);
387 auto arrow_destination = garrow_output_stream_get_raw(destination);
388 auto arrow_file_system = garrow_file_system_get_raw(file_system);
389 auto arrow_schema = garrow_schema_get_raw(schema);
390 auto arrow_options = gadataset_file_write_options_get_raw(options);
391 auto arrow_writer_result =
392 arrow_format->MakeWriter(arrow_destination,
393 arrow_schema,
394 arrow_options,
395 {arrow_file_system, path});
396 if (garrow::check(error, arrow_writer_result, "[file-format][open-writer]")) {
397 auto arrow_writer = *arrow_writer_result;
398 return gadataset_file_writer_new_raw(&arrow_writer);
399 } else {
400 return NULL;
401 }
402 }
403
404 /**
405 * gadataset_file_format_equal:
406 * @format: A #GADatasetFileFormat.
407 * @other_format: A #GADatasetFileFormat to be compared.
408 *
409 * Returns: %TRUE if they are the same content file format, %FALSE otherwise.
410 *
411 * Since: 3.0.0
412 */
413 gboolean
414 gadataset_file_format_equal(GADatasetFileFormat *format,
415 GADatasetFileFormat *other_format)
416 {
417 const auto arrow_format = gadataset_file_format_get_raw(format);
418 const auto arrow_other_format = gadataset_file_format_get_raw(other_format);
419 return arrow_format->Equals(*arrow_other_format);
420 }
421
422
423 G_DEFINE_TYPE(GADatasetCSVFileFormat,
424 gadataset_csv_file_format,
425 GADATASET_TYPE_FILE_FORMAT)
426
427 static void
428 gadataset_csv_file_format_init(GADatasetCSVFileFormat *object)
429 {
430 }
431
432 static void
433 gadataset_csv_file_format_class_init(GADatasetCSVFileFormatClass *klass)
434 {
435 }
436
437 /**
438 * gadataset_csv_file_format_new:
439 *
440 * Returns: The newly created CSV file format.
441 *
442 * Since: 3.0.0
443 */
444 GADatasetCSVFileFormat *
445 gadataset_csv_file_format_new(void)
446 {
447 std::shared_ptr<arrow::dataset::FileFormat> arrow_format =
448 std::make_shared<arrow::dataset::CsvFileFormat>();
449 return GADATASET_CSV_FILE_FORMAT(gadataset_file_format_new_raw(&arrow_format));
450 }
451
452
453 G_DEFINE_TYPE(GADatasetIPCFileFormat,
454 gadataset_ipc_file_format,
455 GADATASET_TYPE_FILE_FORMAT)
456
457 static void
458 gadataset_ipc_file_format_init(GADatasetIPCFileFormat *object)
459 {
460 }
461
462 static void
463 gadataset_ipc_file_format_class_init(GADatasetIPCFileFormatClass *klass)
464 {
465 }
466
467 /**
468 * gadataset_ipc_file_format_new:
469 *
470 * Returns: The newly created IPC file format.
471 *
472 * Since: 3.0.0
473 */
474 GADatasetIPCFileFormat *
475 gadataset_ipc_file_format_new(void)
476 {
477 std::shared_ptr<arrow::dataset::FileFormat> arrow_format =
478 std::make_shared<arrow::dataset::IpcFileFormat>();
479 return GADATASET_IPC_FILE_FORMAT(gadataset_file_format_new_raw(&arrow_format));
480 }
481
482
483 G_DEFINE_TYPE(GADatasetParquetFileFormat,
484 gadataset_parquet_file_format,
485 GADATASET_TYPE_FILE_FORMAT)
486
487 static void
488 gadataset_parquet_file_format_init(GADatasetParquetFileFormat *object)
489 {
490 }
491
492 static void
493 gadataset_parquet_file_format_class_init(GADatasetParquetFileFormatClass *klass)
494 {
495 }
496
497 /**
498 * gadataset_parquet_file_format_new:
499 *
500 * Returns: The newly created Parquet file format.
501 *
502 * Since: 3.0.0
503 */
504 GADatasetParquetFileFormat *
505 gadataset_parquet_file_format_new(void)
506 {
507 std::shared_ptr<arrow::dataset::FileFormat> arrow_format =
508 std::make_shared<arrow::dataset::ParquetFileFormat>();
509 return GADATASET_PARQUET_FILE_FORMAT(
510 gadataset_file_format_new_raw(&arrow_format));
511 }
512
513
514 G_END_DECLS
515
516 GADatasetFileWriteOptions *
517 gadataset_file_write_options_new_raw(
518 std::shared_ptr<arrow::dataset::FileWriteOptions> *arrow_options)
519 {
520 return GADATASET_FILE_WRITE_OPTIONS(
521 g_object_new(GADATASET_TYPE_FILE_WRITE_OPTIONS,
522 "options", arrow_options,
523 NULL));
524 }
525
526 std::shared_ptr<arrow::dataset::FileWriteOptions>
527 gadataset_file_write_options_get_raw(GADatasetFileWriteOptions *options)
528 {
529 auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(options);
530 return priv->options;
531 }
532
533
534 GADatasetFileWriter *
535 gadataset_file_writer_new_raw(
536 std::shared_ptr<arrow::dataset::FileWriter> *arrow_writer)
537 {
538 return GADATASET_FILE_WRITER(g_object_new(GADATASET_TYPE_FILE_WRITER,
539 "writer", arrow_writer,
540 NULL));
541 }
542
543 std::shared_ptr<arrow::dataset::FileWriter>
544 gadataset_file_writer_get_raw(GADatasetFileWriter *writer)
545 {
546 auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(writer);
547 return priv->writer;
548 }
549
550
551 GADatasetFileFormat *
552 gadataset_file_format_new_raw(
553 std::shared_ptr<arrow::dataset::FileFormat> *arrow_format)
554 {
555 GType type = GADATASET_TYPE_FILE_FORMAT;
556 const auto &type_name = (*arrow_format)->type_name();
557 if (type_name == "csv") {
558 type = GADATASET_TYPE_CSV_FILE_FORMAT;
559 } else if (type_name == "ipc") {
560 type = GADATASET_TYPE_IPC_FILE_FORMAT;
561 } else if (type_name == "parquet") {
562 type = GADATASET_TYPE_PARQUET_FILE_FORMAT;
563 }
564 return GADATASET_FILE_FORMAT(g_object_new(type,
565 "format", arrow_format,
566 NULL));
567 }
568
569 std::shared_ptr<arrow::dataset::FileFormat>
570 gadataset_file_format_get_raw(GADatasetFileFormat *format)
571 {
572 auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(format);
573 return priv->format;
574 }