]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/c_glib/arrow-dataset-glib/file-format.cpp
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / c_glib / arrow-dataset-glib / file-format.cpp
CommitLineData
1d09f67e
TL
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#include <arrow-glib/error.hpp>
21#include <arrow-glib/file-system.hpp>
22#include <arrow-glib/output-stream.hpp>
23#include <arrow-glib/record-batch.hpp>
24#include <arrow-glib/reader.hpp>
25#include <arrow-glib/schema.hpp>
26
27#include <arrow-dataset-glib/file-format.hpp>
28
29G_BEGIN_DECLS
30
31/**
32 * SECTION: file-format
33 * @section_id: file-format
34 * @title: File format classes
35 * @include: arrow-dataset-glib/arrow-dataset-glib.h
36 *
37 * #GADatasetFileWriteOptions is a class for options to write a file
38 * of this format.
39 *
40 * #GADatasetFileWriter is a class for writing a file of this format.
41 *
42 * #GADatasetFileFormat is a base class for file format classes.
43 *
44 * #GADatasetCSVFileFormat is a class for CSV file format.
45 *
46 * #GADatasetIPCFileFormat is a class for IPC file format.
47 *
48 * #GADatasetParquetFileFormat is a class for Parquet file format.
49 *
50 * Since: 3.0.0
51 */
52
53typedef struct GADatasetFileWriteOptionsPrivate_ {
54 std::shared_ptr<arrow::dataset::FileWriteOptions> options;
55} GADatasetFileWriteOptionsPrivate;
56
57enum {
58 PROP_OPTIONS = 1,
59};
60
61G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileWriteOptions,
62 gadataset_file_write_options,
63 G_TYPE_OBJECT)
64
65#define GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(obj) \
66 static_cast<GADatasetFileWriteOptionsPrivate *>( \
67 gadataset_file_write_options_get_instance_private( \
68 GADATASET_FILE_WRITE_OPTIONS(obj)))
69
70static void
71gadataset_file_write_options_finalize(GObject *object)
72{
73 auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object);
74 priv->options.~shared_ptr();
75 G_OBJECT_CLASS(gadataset_file_write_options_parent_class)->finalize(object);
76}
77
78static void
79gadataset_file_write_options_set_property(GObject *object,
80 guint prop_id,
81 const GValue *value,
82 GParamSpec *pspec)
83{
84 auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object);
85
86 switch (prop_id) {
87 case PROP_OPTIONS:
88 priv->options =
89 *static_cast<std::shared_ptr<arrow::dataset::FileWriteOptions> *>(
90 g_value_get_pointer(value));
91 break;
92 default:
93 G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
94 break;
95 }
96}
97
98static void
99gadataset_file_write_options_init(GADatasetFileWriteOptions *object)
100{
101 auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object);
102 new(&priv->options) std::shared_ptr<arrow::dataset::FileWriteOptions>;
103}
104
105static void
106gadataset_file_write_options_class_init(GADatasetFileWriteOptionsClass *klass)
107{
108 auto gobject_class = G_OBJECT_CLASS(klass);
109
110 gobject_class->finalize = gadataset_file_write_options_finalize;
111 gobject_class->set_property = gadataset_file_write_options_set_property;
112
113 GParamSpec *spec;
114 spec = g_param_spec_pointer("options",
115 "Options",
116 "The raw "
117 "std::shared<arrow::dataset::FileWriteOptions> *",
118 static_cast<GParamFlags>(G_PARAM_WRITABLE |
119 G_PARAM_CONSTRUCT_ONLY));
120 g_object_class_install_property(gobject_class, PROP_OPTIONS, spec);
121}
122
123
124typedef struct GADatasetFileWriterPrivate_ {
125 std::shared_ptr<arrow::dataset::FileWriter> writer;
126} GADatasetFileWriterPrivate;
127
128enum {
129 PROP_WRITER = 1,
130};
131
132G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileWriter,
133 gadataset_file_writer,
134 G_TYPE_OBJECT)
135
136#define GADATASET_FILE_WRITER_GET_PRIVATE(obj) \
137 static_cast<GADatasetFileWriterPrivate *>( \
138 gadataset_file_writer_get_instance_private( \
139 GADATASET_FILE_WRITER(obj)))
140
141static void
142gadataset_file_writer_finalize(GObject *object)
143{
144 auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object);
145 priv->writer.~shared_ptr();
146 G_OBJECT_CLASS(gadataset_file_writer_parent_class)->finalize(object);
147}
148
149static void
150gadataset_file_writer_set_property(GObject *object,
151 guint prop_id,
152 const GValue *value,
153 GParamSpec *pspec)
154{
155 auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object);
156
157 switch (prop_id) {
158 case PROP_WRITER:
159 priv->writer =
160 *static_cast<std::shared_ptr<arrow::dataset::FileWriter> *>(
161 g_value_get_pointer(value));
162 break;
163 default:
164 G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
165 break;
166 }
167}
168
169static void
170gadataset_file_writer_init(GADatasetFileWriter *object)
171{
172 auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object);
173 new(&(priv->writer)) std::shared_ptr<arrow::dataset::FileWriter>;
174}
175
176static void
177gadataset_file_writer_class_init(GADatasetFileWriterClass *klass)
178{
179 auto gobject_class = G_OBJECT_CLASS(klass);
180
181 gobject_class->finalize = gadataset_file_writer_finalize;
182 gobject_class->set_property = gadataset_file_writer_set_property;
183
184 GParamSpec *spec;
185 spec = g_param_spec_pointer("writer",
186 "Writer",
187 "The raw "
188 "std::shared<arrow::dataset::FileWriter> *",
189 static_cast<GParamFlags>(G_PARAM_WRITABLE |
190 G_PARAM_CONSTRUCT_ONLY));
191 g_object_class_install_property(gobject_class, PROP_WRITER, spec);
192}
193
194/**
195 * gadataset_file_writer_write_record_batch:
196 * @writer: A #GADatasetFileWriter.
197 * @record_batch: A #GArrowRecordBatch to be written.
198 * @error: (nullable): Return location for a #GError or %NULL.
199 *
200 * Returns: %TRUE on success, %FALSE on error.
201 *
202 * Since: 6.0.0
203 */
204gboolean
205gadataset_file_writer_write_record_batch(GADatasetFileWriter *writer,
206 GArrowRecordBatch *record_batch,
207 GError **error)
208{
209 const auto arrow_writer = gadataset_file_writer_get_raw(writer);
210 const auto arrow_record_batch = garrow_record_batch_get_raw(record_batch);
211 auto status = arrow_writer->Write(arrow_record_batch);
212 return garrow::check(error, status, "[file-writer][write-record-batch]");
213}
214
215/**
216 * gadataset_file_writer_write_record_batch_reader:
217 * @writer: A #GADatasetFileWriter.
218 * @reader: A #GArrowRecordBatchReader to be written.
219 * @error: (nullable): Return location for a #GError or %NULL.
220 *
221 * Returns: %TRUE on success, %FALSE on error.
222 *
223 * Since: 6.0.0
224 */
225gboolean
226gadataset_file_writer_write_record_batch_reader(GADatasetFileWriter *writer,
227 GArrowRecordBatchReader *reader,
228 GError **error)
229{
230 const auto arrow_writer = gadataset_file_writer_get_raw(writer);
231 auto arrow_reader = garrow_record_batch_reader_get_raw(reader);
232 auto status = arrow_writer->Write(arrow_reader.get());
233 return garrow::check(error,
234 status,
235 "[file-writer][write-record-batch-reader]");
236}
237
238/**
239 * gadataset_file_writer_finish:
240 * @writer: A #GADatasetFileWriter.
241 * @error: (nullable): Return location for a #GError or %NULL.
242 *
243 * Returns: %TRUE on success, %FALSE on error.
244 *
245 * Since: 6.0.0
246 */
247gboolean
248gadataset_file_writer_finish(GADatasetFileWriter *writer,
249 GError **error)
250{
251 const auto arrow_writer = gadataset_file_writer_get_raw(writer);
252 auto status = arrow_writer->Finish();
253 return garrow::check(error,
254 status,
255 "[file-writer][finish]");
256}
257
258
259typedef struct GADatasetFileFormatPrivate_ {
260 std::shared_ptr<arrow::dataset::FileFormat> format;
261} GADatasetFileFormatPrivate;
262
263enum {
264 PROP_FORMAT = 1,
265};
266
267G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileFormat,
268 gadataset_file_format,
269 G_TYPE_OBJECT)
270
271#define GADATASET_FILE_FORMAT_GET_PRIVATE(obj) \
272 static_cast<GADatasetFileFormatPrivate *>( \
273 gadataset_file_format_get_instance_private( \
274 GADATASET_FILE_FORMAT(obj)))
275
276static void
277gadataset_file_format_finalize(GObject *object)
278{
279 auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object);
280 priv->format.~shared_ptr();
281 G_OBJECT_CLASS(gadataset_file_format_parent_class)->finalize(object);
282}
283
284static void
285gadataset_file_format_set_property(GObject *object,
286 guint prop_id,
287 const GValue *value,
288 GParamSpec *pspec)
289{
290 auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object);
291
292 switch (prop_id) {
293 case PROP_FORMAT:
294 priv->format =
295 *static_cast<std::shared_ptr<arrow::dataset::FileFormat> *>(
296 g_value_get_pointer(value));
297 break;
298 default:
299 G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
300 break;
301 }
302}
303
304static void
305gadataset_file_format_init(GADatasetFileFormat *object)
306{
307 auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object);
308 new(&priv->format) std::shared_ptr<arrow::dataset::FileFormat>;
309}
310
311static void
312gadataset_file_format_class_init(GADatasetFileFormatClass *klass)
313{
314 auto gobject_class = G_OBJECT_CLASS(klass);
315
316 gobject_class->finalize = gadataset_file_format_finalize;
317 gobject_class->set_property = gadataset_file_format_set_property;
318
319 GParamSpec *spec;
320 spec = g_param_spec_pointer("format",
321 "Format",
322 "The raw std::shared<arrow::dataset::FileFormat> *",
323 static_cast<GParamFlags>(G_PARAM_WRITABLE |
324 G_PARAM_CONSTRUCT_ONLY));
325 g_object_class_install_property(gobject_class, PROP_FORMAT, spec);
326}
327
328/**
329 * gadataset_file_format_get_type_name:
330 * @format: A #GADatasetFileFormat.
331 *
332 * Returns: The type name of @format.
333 *
334 * It should be freed with g_free() when no longer needed.
335 *
336 * Since: 3.0.0
337 */
338gchar *
339gadataset_file_format_get_type_name(GADatasetFileFormat *format)
340{
341 const auto arrow_format = gadataset_file_format_get_raw(format);
342 const auto &type_name = arrow_format->type_name();
343 return g_strndup(type_name.data(), type_name.size());
344}
345
346/**
347 * gadataset_file_format_get_default_write_options:
348 * @format: A #GADatasetFileFormat.
349 *
350 * Returns: (transfer full): The default #GADatasetFileWriteOptions of @format.
351 *
352 * Since: 6.0.0
353 */
354GADatasetFileWriteOptions *
355gadataset_file_format_get_default_write_options(GADatasetFileFormat *format)
356{
357 const auto arrow_format = gadataset_file_format_get_raw(format);
358 auto arrow_options = arrow_format->DefaultWriteOptions();
359 return gadataset_file_write_options_new_raw(&arrow_options);
360}
361
362/**
363 * gadataset_file_format_open_writer:
364 * @format: A #GADatasetFileFormat.
365 * @destination: A #GArrowOutputStream.
366 * @file_system: The #GArrowFileSystem of @destination.
367 * @path: The path of @destination.
368 * @schema: A #GArrowSchema that is used by written record batches.
369 * @options: A #GADatasetFileWriteOptions.
370 * @error: (nullable): Return location for a #GError or %NULL.
371 *
372 * Returns: (transfer full): The newly created #GADatasetFileWriter of @format
373 * on success, %NULL on error.
374 *
375 * Since: 6.0.0
376 */
377GADatasetFileWriter *
378gadataset_file_format_open_writer(GADatasetFileFormat *format,
379 GArrowOutputStream *destination,
380 GArrowFileSystem *file_system,
381 const gchar *path,
382 GArrowSchema *schema,
383 GADatasetFileWriteOptions *options,
384 GError **error)
385{
386 const auto arrow_format = gadataset_file_format_get_raw(format);
387 auto arrow_destination = garrow_output_stream_get_raw(destination);
388 auto arrow_file_system = garrow_file_system_get_raw(file_system);
389 auto arrow_schema = garrow_schema_get_raw(schema);
390 auto arrow_options = gadataset_file_write_options_get_raw(options);
391 auto arrow_writer_result =
392 arrow_format->MakeWriter(arrow_destination,
393 arrow_schema,
394 arrow_options,
395 {arrow_file_system, path});
396 if (garrow::check(error, arrow_writer_result, "[file-format][open-writer]")) {
397 auto arrow_writer = *arrow_writer_result;
398 return gadataset_file_writer_new_raw(&arrow_writer);
399 } else {
400 return NULL;
401 }
402}
403
404/**
405 * gadataset_file_format_equal:
406 * @format: A #GADatasetFileFormat.
407 * @other_format: A #GADatasetFileFormat to be compared.
408 *
409 * Returns: %TRUE if they are the same content file format, %FALSE otherwise.
410 *
411 * Since: 3.0.0
412 */
413gboolean
414gadataset_file_format_equal(GADatasetFileFormat *format,
415 GADatasetFileFormat *other_format)
416{
417 const auto arrow_format = gadataset_file_format_get_raw(format);
418 const auto arrow_other_format = gadataset_file_format_get_raw(other_format);
419 return arrow_format->Equals(*arrow_other_format);
420}
421
422
423G_DEFINE_TYPE(GADatasetCSVFileFormat,
424 gadataset_csv_file_format,
425 GADATASET_TYPE_FILE_FORMAT)
426
427static void
428gadataset_csv_file_format_init(GADatasetCSVFileFormat *object)
429{
430}
431
432static void
433gadataset_csv_file_format_class_init(GADatasetCSVFileFormatClass *klass)
434{
435}
436
437/**
438 * gadataset_csv_file_format_new:
439 *
440 * Returns: The newly created CSV file format.
441 *
442 * Since: 3.0.0
443 */
444GADatasetCSVFileFormat *
445gadataset_csv_file_format_new(void)
446{
447 std::shared_ptr<arrow::dataset::FileFormat> arrow_format =
448 std::make_shared<arrow::dataset::CsvFileFormat>();
449 return GADATASET_CSV_FILE_FORMAT(gadataset_file_format_new_raw(&arrow_format));
450}
451
452
453G_DEFINE_TYPE(GADatasetIPCFileFormat,
454 gadataset_ipc_file_format,
455 GADATASET_TYPE_FILE_FORMAT)
456
457static void
458gadataset_ipc_file_format_init(GADatasetIPCFileFormat *object)
459{
460}
461
462static void
463gadataset_ipc_file_format_class_init(GADatasetIPCFileFormatClass *klass)
464{
465}
466
467/**
468 * gadataset_ipc_file_format_new:
469 *
470 * Returns: The newly created IPC file format.
471 *
472 * Since: 3.0.0
473 */
474GADatasetIPCFileFormat *
475gadataset_ipc_file_format_new(void)
476{
477 std::shared_ptr<arrow::dataset::FileFormat> arrow_format =
478 std::make_shared<arrow::dataset::IpcFileFormat>();
479 return GADATASET_IPC_FILE_FORMAT(gadataset_file_format_new_raw(&arrow_format));
480}
481
482
483G_DEFINE_TYPE(GADatasetParquetFileFormat,
484 gadataset_parquet_file_format,
485 GADATASET_TYPE_FILE_FORMAT)
486
487static void
488gadataset_parquet_file_format_init(GADatasetParquetFileFormat *object)
489{
490}
491
492static void
493gadataset_parquet_file_format_class_init(GADatasetParquetFileFormatClass *klass)
494{
495}
496
497/**
498 * gadataset_parquet_file_format_new:
499 *
500 * Returns: The newly created Parquet file format.
501 *
502 * Since: 3.0.0
503 */
504GADatasetParquetFileFormat *
505gadataset_parquet_file_format_new(void)
506{
507 std::shared_ptr<arrow::dataset::FileFormat> arrow_format =
508 std::make_shared<arrow::dataset::ParquetFileFormat>();
509 return GADATASET_PARQUET_FILE_FORMAT(
510 gadataset_file_format_new_raw(&arrow_format));
511}
512
513
514G_END_DECLS
515
516GADatasetFileWriteOptions *
517gadataset_file_write_options_new_raw(
518 std::shared_ptr<arrow::dataset::FileWriteOptions> *arrow_options)
519{
520 return GADATASET_FILE_WRITE_OPTIONS(
521 g_object_new(GADATASET_TYPE_FILE_WRITE_OPTIONS,
522 "options", arrow_options,
523 NULL));
524}
525
526std::shared_ptr<arrow::dataset::FileWriteOptions>
527gadataset_file_write_options_get_raw(GADatasetFileWriteOptions *options)
528{
529 auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(options);
530 return priv->options;
531}
532
533
534GADatasetFileWriter *
535gadataset_file_writer_new_raw(
536 std::shared_ptr<arrow::dataset::FileWriter> *arrow_writer)
537{
538 return GADATASET_FILE_WRITER(g_object_new(GADATASET_TYPE_FILE_WRITER,
539 "writer", arrow_writer,
540 NULL));
541}
542
543std::shared_ptr<arrow::dataset::FileWriter>
544gadataset_file_writer_get_raw(GADatasetFileWriter *writer)
545{
546 auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(writer);
547 return priv->writer;
548}
549
550
551GADatasetFileFormat *
552gadataset_file_format_new_raw(
553 std::shared_ptr<arrow::dataset::FileFormat> *arrow_format)
554{
555 GType type = GADATASET_TYPE_FILE_FORMAT;
556 const auto &type_name = (*arrow_format)->type_name();
557 if (type_name == "csv") {
558 type = GADATASET_TYPE_CSV_FILE_FORMAT;
559 } else if (type_name == "ipc") {
560 type = GADATASET_TYPE_IPC_FILE_FORMAT;
561 } else if (type_name == "parquet") {
562 type = GADATASET_TYPE_PARQUET_FILE_FORMAT;
563 }
564 return GADATASET_FILE_FORMAT(g_object_new(type,
565 "format", arrow_format,
566 NULL));
567}
568
569std::shared_ptr<arrow::dataset::FileFormat>
570gadataset_file_format_get_raw(GADatasetFileFormat *format)
571{
572 auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(format);
573 return priv->format;
574}