]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/dev/archery/archery/integration/datagen.py
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / dev / archery / archery / integration / datagen.py
1 # Licensed to the Apache Software Foundation (ASF) under one
2 # or more contributor license agreements. See the NOTICE file
3 # distributed with this work for additional information
4 # regarding copyright ownership. The ASF licenses this file
5 # to you under the Apache License, Version 2.0 (the
6 # "License"); you may not use this file except in compliance
7 # with the License. You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing,
12 # software distributed under the License is distributed on an
13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 # KIND, either express or implied. See the License for the
15 # specific language governing permissions and limitations
16 # under the License.
17
18 from collections import namedtuple, OrderedDict
19 import binascii
20 import json
21 import os
22 import random
23 import tempfile
24
25 import numpy as np
26
27 from .util import frombytes, tobytes, random_bytes, random_utf8
28
29
def metadata_key_values(pairs):
    """Convert an iterable of (key, value) pairs to JSON metadata entries."""
    entries = []
    for key, value in pairs:
        entries.append({'key': key, 'value': value})
    return entries
32
33
class Field(object):
    """Base class for schema fields in the generated integration JSON."""

    def __init__(self, name, *, nullable=True, metadata=None):
        self.name = name
        self.nullable = nullable
        # Metadata is a list of (key, value) pairs; default to empty.
        self.metadata = metadata or []

    def get_json(self):
        """Return this field's JSON representation as an OrderedDict."""
        out = OrderedDict()
        out['name'] = self.name
        out['type'] = self._get_type()
        out['nullable'] = self.nullable
        out['children'] = self._get_children()

        dictionary = self._get_dictionary()
        if dictionary:
            out['dictionary'] = dictionary

        if self.metadata is not None and len(self.metadata) > 0:
            out['metadata'] = metadata_key_values(self.metadata)

        return out

    def _get_dictionary(self):
        # Plain fields carry no dictionary entry; DictionaryField overrides.
        return None

    def _make_is_valid(self, size, null_probability=0.4):
        """Return an int8 validity array; all ones when non-nullable."""
        if not self.nullable:
            return np.ones(size, dtype=np.int8)
        draws = np.random.random_sample(size)
        return (draws > null_probability).astype(np.int8)
67
68
class Column(object):
    """Base class for generated column data."""

    def __init__(self, name, count):
        self.name = name
        self.count = count

    def __len__(self):
        return self.count

    def _get_children(self):
        return []

    def _get_buffers(self):
        return []

    def get_json(self):
        """Return this column's JSON representation as an OrderedDict."""
        out = OrderedDict([('name', self.name), ('count', self.count)])
        for buffer_name, buffer_data in self._get_buffers():
            out[buffer_name] = buffer_data
        children = self._get_children()
        if children:
            out['children'] = children
        return out
98
99
class PrimitiveField(Field):
    """A field whose type has no child fields."""

    def _get_children(self):
        return []
104
105
class PrimitiveColumn(Column):
    """Column with a validity buffer and a flat data buffer."""

    def __init__(self, name, count, is_valid, values):
        super().__init__(name, count)
        self.is_valid = is_valid
        self.values = values

    def _encode_value(self, x):
        # Identity by default; subclasses override for JSON-safe encodings.
        return x

    def _get_buffers(self):
        validity = [int(flag) for flag in self.is_valid]
        data = [self._encode_value(value) for value in self.values]
        return [('VALIDITY', validity), ('DATA', data)]
121
122
class NullColumn(Column):
    """Column of the null type; a distinct subclass purely for readability."""
126
127
class NullField(PrimitiveField):
    """Field of the Arrow null type; always nullable by definition."""

    def __init__(self, name, metadata=None):
        super().__init__(name, nullable=True, metadata=metadata)

    def _get_type(self):
        return OrderedDict([('name', 'null')])

    def generate_column(self, size, name=None):
        # Null columns carry no buffers, only a count.
        return NullColumn(name or self.name, size)
139
140
# Default bounds for generated integer data: the 32-bit signed range.
TEST_INT_MAX = 2 ** 31 - 1
TEST_INT_MIN = -(2 ** 31)
143
144
class IntegerField(PrimitiveField):
    """Signed or unsigned integer field with a fixed bit width."""

    def __init__(self, name, is_signed, bit_width, *, nullable=True,
                 metadata=None,
                 min_value=TEST_INT_MIN,
                 max_value=TEST_INT_MAX):
        super().__init__(name, nullable=nullable, metadata=metadata)
        self.is_signed = is_signed
        self.bit_width = bit_width
        self.min_value = min_value
        self.max_value = max_value

    def _get_generated_data_bounds(self):
        # Clamp the configured bounds to what the physical type can hold.
        if self.is_signed:
            info = np.iinfo('int' + str(self.bit_width))
            type_min, type_max = info.min, info.max
        else:
            info = np.iinfo('uint' + str(self.bit_width))
            type_min, type_max = 0, info.max
        return max(type_min, self.min_value), min(type_max, self.max_value)

    def _get_type(self):
        return OrderedDict([
            ('name', 'int'),
            ('isSigned', self.is_signed),
            ('bitWidth', self.bit_width),
        ])

    def generate_column(self, size, name=None):
        lower, upper = self._get_generated_data_bounds()
        return self.generate_range(size, lower, upper,
                                   name=name, include_extremes=True)

    def generate_range(self, size, lower, upper, name=None,
                       include_extremes=False):
        """Generate a column of random integers in [lower, upper)."""
        raw = np.random.randint(lower, upper, size=size, dtype=np.int64)
        if include_extremes and size >= 2:
            # Force both bounds into the data so extremes are exercised.
            raw[:2] = [lower, upper]
        # 64-bit values are serialized as strings to survive JSON round-trips.
        encode = int if self.bit_width < 64 else str
        values = [encode(v) for v in raw]

        is_valid = self._make_is_valid(size)
        return PrimitiveColumn(self.name if name is None else name,
                               size, is_valid, values)
194
195
class DateField(IntegerField):
    """Date field stored as 32-bit days or 64-bit milliseconds."""

    DAY = 0
    MILLISECOND = 1

    # Covers 1/1/1 through 12/31/9999 in each unit.
    _ranges = {
        DAY: [-719162, 2932896],
        MILLISECOND: [-62135596800000, 253402214400000]
    }

    def __init__(self, name, unit, *, nullable=True, metadata=None):
        min_value, max_value = self._ranges[unit]
        super().__init__(
            name, True, 32 if unit == self.DAY else 64,
            nullable=nullable, metadata=metadata,
            min_value=min_value, max_value=max_value
        )
        self.unit = unit

    def _get_type(self):
        unit_name = 'DAY' if self.unit == self.DAY else 'MILLISECOND'
        return OrderedDict([('name', 'date'), ('unit', unit_name)])
223
224
# Map from numpy-style unit abbreviations to Arrow JSON unit names.
TIMEUNIT_NAMES = dict(
    s='SECOND',
    ms='MILLISECOND',
    us='MICROSECOND',
    ns='NANOSECOND',
)
231
232
class TimeField(IntegerField):
    """Time-of-day field; bit width and value range depend on the unit."""

    BIT_WIDTHS = {'s': 32, 'ms': 32, 'us': 64, 'ns': 64}

    # Valid values span exactly one day in each unit.
    _ranges = {
        's': [0, 86400],
        'ms': [0, 86400000],
        'us': [0, 86400000000],
        'ns': [0, 86400000000000]
    }

    def __init__(self, name, unit='s', *, nullable=True,
                 metadata=None):
        lo, hi = self._ranges[unit]
        super().__init__(name, True, self.BIT_WIDTHS[unit],
                         nullable=nullable, metadata=metadata,
                         min_value=lo, max_value=hi)
        self.unit = unit

    def _get_type(self):
        return OrderedDict([
            ('name', 'time'),
            ('unit', TIMEUNIT_NAMES[self.unit]),
            ('bitWidth', self.bit_width),
        ])
263
264
class TimestampField(IntegerField):
    """64-bit timestamp field with an optional timezone."""

    # 1/1/1 through 12/31/9999, except nanoseconds which are limited to
    # the physical int64 range (~584 years and change).
    _ranges = {
        's': [-62135596800, 253402214400],
        'ms': [-62135596800000, 253402214400000],
        'us': [-62135596800000000, 253402214400000000],
        'ns': [np.iinfo('int64').min, np.iinfo('int64').max]
    }

    def __init__(self, name, unit='s', tz=None, *, nullable=True,
                 metadata=None):
        lo, hi = self._ranges[unit]
        super().__init__(name, True, 64,
                         nullable=nullable,
                         metadata=metadata,
                         min_value=lo,
                         max_value=hi)
        self.unit = unit
        self.tz = tz

    def _get_type(self):
        entries = [
            ('name', 'timestamp'),
            ('unit', TIMEUNIT_NAMES[self.unit]),
        ]
        # Timezone is only emitted when present.
        if self.tz is not None:
            entries.append(('timezone', self.tz))
        return OrderedDict(entries)
298
299
class DurationIntervalField(IntegerField):
    """Duration field spanning the full int64 range."""

    def __init__(self, name, unit='s', *, nullable=True,
                 metadata=None):
        info = np.iinfo('int64')
        super().__init__(
            name, True, 64,
            nullable=nullable, metadata=metadata,
            min_value=info.min, max_value=info.max)
        self.unit = unit

    def _get_type(self):
        return OrderedDict([
            ('name', 'duration'),
            ('unit', TIMEUNIT_NAMES[self.unit]),
        ])
318
319
class YearMonthIntervalField(IntegerField):
    """Interval field with YEAR_MONTH unit, limited to +/- 10000 years."""

    def __init__(self, name, *, nullable=True, metadata=None):
        months_in_10000_years = 10000 * 12
        super().__init__(
            name, True, 32,
            nullable=nullable, metadata=metadata,
            min_value=-months_in_10000_years,
            max_value=months_in_10000_years)

    def _get_type(self):
        return OrderedDict([
            ('name', 'interval'),
            ('unit', 'YEAR_MONTH'),
        ])
335
336
class DayTimeIntervalField(PrimitiveField):
    """Interval field with DAY_TIME unit ({days, milliseconds} structs).

    Fix: the ``nullable`` argument was previously ignored — the constructor
    always passed ``nullable=True`` to the base class, so callers requesting
    a non-nullable field silently got a nullable one.
    """

    def __init__(self, name, *, nullable=True, metadata=None):
        # Pass the caller's nullable flag through instead of hard-coding True.
        super().__init__(name,
                         nullable=nullable,
                         metadata=metadata)

    @property
    def numpy_type(self):
        return object

    def _get_type(self):
        return OrderedDict([
            ('name', 'interval'),
            ('unit', 'DAY_TIME'),
        ])

    def generate_column(self, size, name=None):
        """Generate random {days, milliseconds} dicts within +/- 10000 years
        of days and +/- one day of milliseconds."""
        min_day_value, max_day_value = -10000 * 366, 10000 * 366
        values = [{'days': random.randint(min_day_value, max_day_value),
                   'milliseconds': random.randint(-86400000, +86400000)}
                  for _ in range(size)]

        is_valid = self._make_is_valid(size)
        if name is None:
            name = self.name
        return PrimitiveColumn(name, size, is_valid, values)
364
365
class MonthDayNanoIntervalField(PrimitiveField):
    """Interval field with MONTH_DAY_NANO unit.

    Fix: the ``nullable`` argument was previously ignored — the constructor
    always passed ``nullable=True`` to the base class.
    """

    def __init__(self, name, *, nullable=True, metadata=None):
        # Pass the caller's nullable flag through instead of hard-coding True.
        super().__init__(name,
                         nullable=nullable,
                         metadata=metadata)

    @property
    def numpy_type(self):
        return object

    def _get_type(self):
        return OrderedDict([
            ('name', 'interval'),
            ('unit', 'MONTH_DAY_NANO'),
        ])

    def generate_column(self, size, name=None):
        """Generate random {months, days, nanoseconds} dicts spanning the
        full int32 (months/days) and int64 (nanoseconds) ranges."""
        i32 = np.iinfo('int32')
        i64 = np.iinfo('int64')
        values = [{'months': random.randint(i32.min, i32.max),
                   'days': random.randint(i32.min, i32.max),
                   'nanoseconds': random.randint(i64.min, i64.max)}
                  for _ in range(size)]

        is_valid = self._make_is_valid(size)
        if name is None:
            name = self.name
        return PrimitiveColumn(name, size, is_valid, values)
397
398
class FloatingPointField(PrimitiveField):
    """Floating point field at HALF, SINGLE or DOUBLE precision."""

    _PRECISIONS = {16: 'HALF', 32: 'SINGLE', 64: 'DOUBLE'}

    def __init__(self, name, bit_width, *, nullable=True,
                 metadata=None):
        super().__init__(name,
                         nullable=nullable,
                         metadata=metadata)
        self.bit_width = bit_width
        # KeyError here means an unsupported bit width, as before.
        self.precision = self._PRECISIONS[bit_width]

    @property
    def numpy_type(self):
        return 'float' + str(self.bit_width)

    def _get_type(self):
        return OrderedDict([
            ('name', 'floatingpoint'),
            ('precision', self.precision),
        ])

    def generate_column(self, size, name=None):
        # Normally-distributed values scaled by 1000, rounded to 3 decimals.
        values = np.round(np.random.randn(size) * 1000, 3)

        is_valid = self._make_is_valid(size)
        return PrimitiveColumn(self.name if name is None else name,
                               size, is_valid, values)
432
433
# For selected decimal precisions, the largest positive value generated:
# the i-th listed precision (1-based) maps to the maximum signed integer
# representable in 8*i - 1 bits.
DECIMAL_PRECISION_TO_VALUE = {
    precision: (1 << (8 * index - 1)) - 1
    for index, precision in enumerate(
        [1, 3, 5, 7, 10, 12, 15, 17, 19, 22, 24, 27, 29, 32, 34, 36,
         40, 42, 44, 50, 60, 70],
        1)
}
441
442
def decimal_range_from_precision(precision):
    """Return (min, max) generated values for a decimal of this precision.

    Precisions without an exact table entry fall back to the next smaller
    bucket; the minimum is the bitwise complement of the maximum.
    """
    assert 1 <= precision <= 76
    while precision not in DECIMAL_PRECISION_TO_VALUE:
        precision -= 1
    max_value = DECIMAL_PRECISION_TO_VALUE[precision]
    return ~max_value, max_value
451
452
class DecimalField(PrimitiveField):
    """Decimal field with the given precision, scale and bit width.

    Fix: the ``nullable`` argument was previously ignored — the constructor
    always passed ``nullable=True`` to the base class.
    """

    def __init__(self, name, precision, scale, bit_width, *,
                 nullable=True, metadata=None):
        # Pass the caller's nullable flag through instead of hard-coding True.
        super().__init__(name, nullable=nullable,
                         metadata=metadata)
        self.precision = precision
        self.scale = scale
        self.bit_width = bit_width

    @property
    def numpy_type(self):
        return object

    def _get_type(self):
        return OrderedDict([
            ('name', 'decimal'),
            ('precision', self.precision),
            ('scale', self.scale),
            ('bitWidth', self.bit_width),
        ])

    def generate_column(self, size, name=None):
        """Generate random unscaled integer values within the precision."""
        min_value, max_value = decimal_range_from_precision(self.precision)
        values = [random.randint(min_value, max_value) for _ in range(size)]

        is_valid = self._make_is_valid(size)
        if name is None:
            name = self.name
        return DecimalColumn(name, size, is_valid, values, self.bit_width)
482
483
class DecimalColumn(PrimitiveColumn):
    """Decimal column; values serialize as strings for JSON round-trips."""

    def __init__(self, name, count, is_valid, values, bit_width):
        super().__init__(name, count, is_valid, values)
        self.bit_width = bit_width

    def _encode_value(self, x):
        # Arbitrary-precision integers must travel through JSON as text.
        return str(x)
492
493
class BooleanField(PrimitiveField):
    """Boolean field (1-bit values)."""

    bit_width = 1

    def _get_type(self):
        return OrderedDict([('name', 'bool')])

    @property
    def numpy_type(self):
        return 'bool'

    def generate_column(self, size, name=None):
        values = [bool(bit) for bit in np.random.randint(0, 2, size=size)]
        is_valid = self._make_is_valid(size)
        return PrimitiveColumn(self.name if name is None else name,
                               size, is_valid, values)
510
511
class FixedSizeBinaryField(PrimitiveField):
    """Binary field with a fixed byte width per value."""

    def __init__(self, name, byte_width, *, nullable=True,
                 metadata=None):
        super().__init__(name, nullable=nullable,
                         metadata=metadata)
        self.byte_width = byte_width

    @property
    def numpy_type(self):
        return object

    @property
    def column_class(self):
        return FixedSizeBinaryColumn

    def _get_type(self):
        return OrderedDict([('name', 'fixedsizebinary'),
                            ('byteWidth', self.byte_width)])

    def generate_column(self, size, name=None):
        is_valid = self._make_is_valid(size)
        # Every slot gets data, valid or not: fixed-width layouts always
        # allocate byte_width bytes per slot.
        values = [random_bytes(self.byte_width) for _ in range(size)]
        return self.column_class(self.name if name is None else name,
                                 size, is_valid, values)
542
543
class BinaryField(PrimitiveField):
    """Variable-length binary field."""

    @property
    def numpy_type(self):
        return object

    @property
    def column_class(self):
        return BinaryColumn

    def _get_type(self):
        return OrderedDict([('name', 'binary')])

    def _random_sizes(self, size):
        # Exponential distribution: mostly short values with a long tail.
        return np.random.exponential(scale=4, size=size).astype(np.int32)

    def generate_column(self, size, name=None):
        is_valid = self._make_is_valid(size)
        values = []

        for flag, nbytes in zip(is_valid, self._random_sizes(size)):
            # Null slots contribute empty payloads.
            values.append(random_bytes(nbytes) if flag else b"")

        return self.column_class(self.name if name is None else name,
                                 size, is_valid, values)
575
576
class StringField(BinaryField):
    """Variable-length UTF-8 string field."""

    @property
    def column_class(self):
        return StringColumn

    def _get_type(self):
        return OrderedDict([('name', 'utf8')])

    def generate_column(self, size, name=None):
        K = 7  # codepoints per generated string
        is_valid = self._make_is_valid(size)
        values = [tobytes(random_utf8(K)) if flag else b""
                  for flag in is_valid]
        return self.column_class(self.name if name is None else name,
                                 size, is_valid, values)
600
601
class LargeBinaryField(BinaryField):
    """Binary field with 64-bit offsets."""

    @property
    def column_class(self):
        return LargeBinaryColumn

    def _get_type(self):
        return OrderedDict([('name', 'largebinary')])
610
611
class LargeStringField(StringField):
    """UTF-8 string field with 64-bit offsets."""

    @property
    def column_class(self):
        return LargeStringColumn

    def _get_type(self):
        return OrderedDict([('name', 'largeutf8')])
620
621
class Schema(object):
    """Schema: an ordered list of fields plus optional metadata."""

    def __init__(self, fields, metadata=None):
        self.fields = fields
        self.metadata = metadata

    def get_json(self):
        """Return the schema's JSON representation as an OrderedDict."""
        out = OrderedDict()
        out['fields'] = [field.get_json() for field in self.fields]
        if self.metadata is not None and len(self.metadata) > 0:
            out['metadata'] = metadata_key_values(self.metadata)
        return out
637
638
639 class _NarrowOffsetsMixin:
640
641 def _encode_offsets(self, offsets):
642 return list(map(int, offsets))
643
644
645 class _LargeOffsetsMixin:
646
647 def _encode_offsets(self, offsets):
648 # 64-bit offsets have to be represented as strings to roundtrip
649 # through JSON.
650 return list(map(str, offsets))
651
652
class _BaseBinaryColumn(PrimitiveColumn):
    """Variable-length binary column: validity, offsets and hex data."""

    def _encode_value(self, x):
        # Binary payloads are emitted as uppercase hex text.
        return frombytes(binascii.hexlify(x).upper())

    def _get_buffers(self):
        offsets = [0]
        data = []
        position = 0
        for flag, value in zip(self.is_valid, self.values):
            if flag:
                position += len(value)
            else:
                # Null slots contribute no bytes to the data region.
                value = b""
            offsets.append(position)
            data.append(self._encode_value(value))

        return [
            ('VALIDITY', [int(flag) for flag in self.is_valid]),
            ('OFFSET', self._encode_offsets(offsets)),
            ('DATA', data),
        ]
677
678
class _BaseStringColumn(_BaseBinaryColumn):
    """Variable-length string column; data values are emitted as text."""

    def _encode_value(self, x):
        return frombytes(x)
683
684
class BinaryColumn(_BaseBinaryColumn, _NarrowOffsetsMixin):
    """Binary column with 32-bit offsets."""
687
688
class StringColumn(_BaseStringColumn, _NarrowOffsetsMixin):
    """String column with 32-bit offsets."""
691
692
class LargeBinaryColumn(_BaseBinaryColumn, _LargeOffsetsMixin):
    """Binary column with 64-bit offsets."""
695
696
class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin):
    """String column with 64-bit offsets."""
699
700
class FixedSizeBinaryColumn(PrimitiveColumn):
    """Fixed-width binary column: validity plus hex data, no offsets."""

    def _encode_value(self, x):
        return frombytes(binascii.hexlify(x).upper())

    def _get_buffers(self):
        data = [self._encode_value(value) for value in self.values]
        return [
            ('VALIDITY', [int(flag) for flag in self.is_valid]),
            ('DATA', data),
        ]
715
716
class ListField(Field):
    """Variable-size list field with 32-bit offsets."""

    def __init__(self, name, value_field, *, nullable=True,
                 metadata=None):
        super().__init__(name, nullable=nullable,
                         metadata=metadata)
        self.value_field = value_field

    @property
    def column_class(self):
        return ListColumn

    def _get_type(self):
        return OrderedDict([('name', 'list')])

    def _get_children(self):
        return [self.value_field.get_json()]

    def generate_column(self, size, name=None):
        max_list_size = 4

        is_valid = self._make_is_valid(size)
        slot_sizes = np.random.randint(0, max_list_size + 1, size=size)

        offsets = [0]
        total = 0
        for flag, slot_size in zip(is_valid, slot_sizes):
            # Null slots contribute no child elements.
            if flag:
                total += int(slot_size)
            offsets.append(total)

        # 'total' is now the number of elements in the child array.
        values = self.value_field.generate_column(total)

        return self.column_class(self.name if name is None else name,
                                 size, is_valid, offsets, values)
756
757
class LargeListField(ListField):
    """Variable-size list field with 64-bit offsets."""

    @property
    def column_class(self):
        return LargeListColumn

    def _get_type(self):
        return OrderedDict([('name', 'largelist')])
768
769
class _BaseListColumn(Column):
    """List column: validity + offsets buffers plus one child array."""

    def __init__(self, name, count, is_valid, offsets, values):
        super().__init__(name, count)
        self.is_valid = is_valid
        self.offsets = offsets
        self.values = values

    def _get_buffers(self):
        return [
            ('VALIDITY', [int(flag) for flag in self.is_valid]),
            ('OFFSET', self._encode_offsets(self.offsets)),
        ]

    def _get_children(self):
        return [self.values.get_json()]
786
787
class ListColumn(_BaseListColumn, _NarrowOffsetsMixin):
    """List column with 32-bit offsets."""
790
791
class LargeListColumn(_BaseListColumn, _LargeOffsetsMixin):
    """List column with 64-bit offsets."""
794
795
class MapField(Field):
    """Map field: a list of non-nullable {key, item} struct entries."""

    def __init__(self, name, key_field, item_field, *, nullable=True,
                 metadata=None, keys_sorted=False, entries_name='entries'):
        super().__init__(name, nullable=nullable,
                         metadata=metadata)

        # Map keys are required to be non-nullable by the Arrow format.
        assert not key_field.nullable
        self.key_field = key_field
        self.item_field = item_field
        self.pair_field = StructField(entries_name, [key_field, item_field],
                                      nullable=False)
        self.keys_sorted = keys_sorted

    def _get_type(self):
        return OrderedDict([
            ('name', 'map'),
            ('keysSorted', self.keys_sorted),
        ])

    def _get_children(self):
        return [self.pair_field.get_json()]

    def generate_column(self, size, name=None):
        max_map_size = 4

        is_valid = self._make_is_valid(size)
        entry_counts = np.random.randint(0, max_map_size + 1, size=size)

        offsets = [0]
        total = 0
        for flag, n_entries in zip(is_valid, entry_counts):
            # Null slots contribute no entries.
            if flag:
                total += int(n_entries)
            offsets.append(total)

        # 'total' is now the number of entries in the child struct array.
        pairs = self.pair_field.generate_column(total)

        return MapColumn(self.name if name is None else name,
                         size, is_valid, offsets, pairs)
838
839
class MapColumn(Column):
    """Map column: validity + offsets plus the entries struct child."""

    def __init__(self, name, count, is_valid, offsets, pairs):
        super().__init__(name, count)
        self.is_valid = is_valid
        self.offsets = offsets
        self.pairs = pairs

    def _get_buffers(self):
        return [
            ('VALIDITY', [int(flag) for flag in self.is_valid]),
            ('OFFSET', list(self.offsets)),
        ]

    def _get_children(self):
        return [self.pairs.get_json()]
856
857
class FixedSizeListField(Field):
    """List field whose every slot holds exactly list_size elements."""

    def __init__(self, name, value_field, list_size, *, nullable=True,
                 metadata=None):
        super().__init__(name, nullable=nullable,
                         metadata=metadata)
        self.value_field = value_field
        self.list_size = list_size

    def _get_type(self):
        return OrderedDict([
            ('name', 'fixedsizelist'),
            ('listSize', self.list_size),
        ])

    def _get_children(self):
        return [self.value_field.get_json()]

    def generate_column(self, size, name=None):
        is_valid = self._make_is_valid(size)
        # Child array length is fixed: size slots * list_size elements each.
        values = self.value_field.generate_column(size * self.list_size)

        return FixedSizeListColumn(self.name if name is None else name,
                                   size, is_valid, values)
883
884
class FixedSizeListColumn(Column):
    """Fixed-size list column: validity buffer plus one child array."""

    def __init__(self, name, count, is_valid, values):
        super().__init__(name, count)
        self.is_valid = is_valid
        self.values = values

    def _get_buffers(self):
        return [('VALIDITY', [int(flag) for flag in self.is_valid])]

    def _get_children(self):
        return [self.values.get_json()]
899
900
class StructField(Field):
    """Struct field composed of named child fields."""

    def __init__(self, name, fields, *, nullable=True,
                 metadata=None):
        super().__init__(name, nullable=nullable,
                         metadata=metadata)
        self.fields = fields

    def _get_type(self):
        return OrderedDict([('name', 'struct')])

    def _get_children(self):
        return [field.get_json() for field in self.fields]

    def generate_column(self, size, name=None):
        is_valid = self._make_is_valid(size)
        # Each child is generated at the full struct length.
        children = [field.generate_column(size) for field in self.fields]
        return StructColumn(self.name if name is None else name,
                            size, is_valid, children)
924
925
class _BaseUnionField(Field):
    """Base for union fields: child fields plus their logical type ids.

    Fix: the default ``type_ids`` were computed as ``range(fields)``, which
    raises ``TypeError`` because ``fields`` is a list; the intended default
    is one id per child field, ``range(len(fields))``.
    """

    def __init__(self, name, fields, type_ids=None, *, nullable=True,
                 metadata=None):
        super().__init__(name, nullable=nullable, metadata=metadata)
        if type_ids is None:
            # One logical type id per child, 0..len(fields)-1.
            type_ids = list(range(len(fields)))
        else:
            assert len(fields) == len(type_ids)
        self.fields = fields
        self.type_ids = type_ids
        # The JSON integration format requires non-negative type ids.
        assert all(x >= 0 for x in self.type_ids)

    def _get_type(self):
        return OrderedDict([
            ('name', 'union'),
            ('mode', self.mode),
            ('typeIds', self.type_ids),
        ])

    def _get_children(self):
        return [field.get_json() for field in self.fields]

    def _make_type_ids(self, size):
        """Draw a random logical type id for each of the `size` slots."""
        return np.random.choice(self.type_ids, size)
951
952
class SparseUnionField(_BaseUnionField):
    """Union field in SPARSE mode: all children span the full length."""

    mode = 'SPARSE'

    def generate_column(self, size, name=None):
        type_ids = self._make_type_ids(size)
        children = [child.generate_column(size) for child in self.fields]
        return SparseUnionColumn(self.name if name is None else name,
                                 size, type_ids, children)
963
964
class DenseUnionField(_BaseUnionField):
    """Union field in DENSE mode: per-slot offsets into compact children."""

    mode = 'DENSE'

    def generate_column(self, size, name=None):
        # Reverse mapping {logical type id => physical child id}.
        child_index = {type_id: pos
                       for pos, type_id in enumerate(self.type_ids)}

        array_type_ids = self._make_type_ids(size)
        child_sizes = [0] * len(self.fields)
        offsets = []

        # Each slot's offset is the running length of its chosen child.
        for type_id in array_type_ids:
            child = child_index[type_id]
            offsets.append(child_sizes[child])
            child_sizes[child] += 1

        children = [
            field.generate_column(length)
            for field, length in zip(self.fields, child_sizes)]

        return DenseUnionColumn(self.name if name is None else name,
                                size, array_type_ids, offsets, children)
992
993
class Dictionary(object):
    """Dictionary values shared by one or more dictionary-encoded fields."""

    def __init__(self, id_, field, size, name=None, ordered=False):
        self.id_ = id_
        self.field = field
        self.values = field.generate_column(size=size, name=name)
        self.ordered = ordered

    def __len__(self):
        return len(self.values)

    def get_json(self):
        # Dictionary data is serialized as a single record batch.
        batch = RecordBatch(len(self.values), [self.values])
        return OrderedDict([
            ('id', self.id_),
            ('data', batch.get_json()),
        ])
1011
1012
class DictionaryField(Field):
    """Field encoded against a shared Dictionary via integer indices."""

    def __init__(self, name, index_field, dictionary, *, nullable=True,
                 metadata=None):
        super().__init__(name, nullable=nullable,
                         metadata=metadata)
        # The index field is anonymous; only its integer type matters.
        assert index_field.name == ''
        assert isinstance(index_field, IntegerField)
        assert isinstance(dictionary, Dictionary)

        self.index_field = index_field
        self.dictionary = dictionary

    def _get_type(self):
        # The field reports the dictionary *value* type.
        return self.dictionary.field._get_type()

    def _get_children(self):
        return self.dictionary.field._get_children()

    def _get_dictionary(self):
        return OrderedDict([
            ('id', self.dictionary.id_),
            ('indexType', self.index_field._get_type()),
            ('isOrdered', self.dictionary.ordered),
        ])

    def generate_column(self, size, name=None):
        # Indices are drawn uniformly over the dictionary's value count.
        return self.index_field.generate_range(
            size, 0, len(self.dictionary),
            name=self.name if name is None else name)
1044
1045
# Lightweight record describing an extension type: its registered name,
# its serialized metadata payload, and the underlying storage field.
ExtensionType = namedtuple(
    'ExtensionType', 'extension_name serialized storage_field')
1048
1049
class ExtensionField(Field):
    """Field carrying an extension type; delegates to its storage field."""

    def __init__(self, name, extension_type, *, nullable=True, metadata=None):
        # Annotate the field metadata with the extension name and payload.
        metadata = (metadata or []) + [
            ('ARROW:extension:name', extension_type.extension_name),
            ('ARROW:extension:metadata', extension_type.serialized),
        ]
        super().__init__(name, nullable=nullable, metadata=metadata)
        self.extension_type = extension_type

    def _get_type(self):
        return self.extension_type.storage_field._get_type()

    def _get_children(self):
        return self.extension_type.storage_field._get_children()

    def _get_dictionary(self):
        return self.extension_type.storage_field._get_dictionary()

    def generate_column(self, size, name=None):
        storage = self.extension_type.storage_field
        return storage.generate_column(size,
                                       self.name if name is None else name)
1073
1074
class StructColumn(Column):
    """Struct column: validity buffer plus one child column per field."""

    def __init__(self, name, count, is_valid, field_values):
        super().__init__(name, count)
        self.is_valid = is_valid
        self.field_values = field_values

    def _get_buffers(self):
        return [('VALIDITY', [int(flag) for flag in self.is_valid])]

    def _get_children(self):
        return [child.get_json() for child in self.field_values]
1089
1090
class SparseUnionColumn(Column):
    """Sparse union column: a TYPE_ID buffer plus full-length children."""

    def __init__(self, name, count, type_ids, field_values):
        super().__init__(name, count)
        self.type_ids = type_ids
        self.field_values = field_values

    def _get_buffers(self):
        return [('TYPE_ID', [int(tid) for tid in self.type_ids])]

    def _get_children(self):
        return [child.get_json() for child in self.field_values]
1105
1106
class DenseUnionColumn(Column):
    """Dense union column: TYPE_ID and OFFSET buffers plus children."""

    def __init__(self, name, count, type_ids, offsets, field_values):
        super().__init__(name, count)
        self.type_ids = type_ids
        self.offsets = offsets
        self.field_values = field_values

    def _get_buffers(self):
        return [
            ('TYPE_ID', [int(tid) for tid in self.type_ids]),
            ('OFFSET', [int(off) for off in self.offsets]),
        ]

    def _get_children(self):
        return [child.get_json() for child in self.field_values]
1123
1124
class RecordBatch(object):
    """A single record batch: a row count plus its columns."""

    def __init__(self, count, columns):
        self.count = count
        self.columns = columns

    def get_json(self):
        column_json = [column.get_json() for column in self.columns]
        return OrderedDict([
            ('count', self.count),
            ('columns', column_json),
        ])
1136
1137
class File(object):
    """One integration-test JSON file plus its skip configuration."""

    def __init__(self, name, schema, batches, dictionaries=None,
                 skip=None, path=None):
        self.name = name
        self.schema = schema
        self.dictionaries = dictionaries or []
        self.batches = batches
        self.skip = set(skip) if skip else set()
        self.path = path

    def get_json(self):
        """Return the full file content as an OrderedDict."""
        out = OrderedDict()
        out['schema'] = self.schema.get_json()
        if self.dictionaries:
            out['dictionaries'] = [dictionary.get_json()
                                   for dictionary in self.dictionaries]
        out['batches'] = [batch.get_json() for batch in self.batches]
        return out

    def write(self, path):
        """Serialize to *path* as UTF-8 JSON and remember the location."""
        with open(path, 'wb') as f:
            f.write(json.dumps(self.get_json(), indent=2).encode('utf-8'))
        self.path = path

    def skip_category(self, category):
        """Skip this test for the given category.

        Category should be SKIP_ARROW or SKIP_FLIGHT.
        """
        self.skip.add(category)
        return self
1177
1178
def get_field(name, type_, **kwargs):
    """Construct a Field of the named logical type.

    Raises TypeError for unsupported numpy dtype kinds.
    """
    by_name = {
        'binary': BinaryField,
        'utf8': StringField,
        'largebinary': LargeBinaryField,
        'largeutf8': LargeStringField,
    }
    if type_ in by_name:
        return by_name[type_](name, **kwargs)
    if type_.startswith('fixedsizebinary_'):
        # e.g. 'fixedsizebinary_19' => byte width 19
        byte_width = int(type_.split('_')[1])
        return FixedSizeBinaryField(name, byte_width=byte_width, **kwargs)

    dtype = np.dtype(type_)

    if dtype.kind in ('i', 'u'):
        return IntegerField(name, dtype.kind == 'i', dtype.itemsize * 8,
                            **kwargs)
    if dtype.kind == 'f':
        return FloatingPointField(name, dtype.itemsize * 8, **kwargs)
    if dtype.kind == 'b':
        return BooleanField(name, **kwargs)
    raise TypeError(dtype)
1205
1206
def _generate_file(name, fields, batch_sizes, dictionaries=None, skip=None,
                   metadata=None):
    """Build a File with one RecordBatch per requested batch size."""
    schema = Schema(fields, metadata=metadata)
    batches = [
        RecordBatch(size, [field.generate_column(size) for field in fields])
        for size in batch_sizes
    ]
    return File(name, schema, batches, dictionaries, skip=skip)
1220
1221
def generate_custom_metadata_case():
    """Case exercising custom metadata at schema, field and child level."""
    def meta(items):
        # Simple metadata block: every whitespace-delimited key maps to '{}'.
        return [(key, '{}') for key in items.split()]

    fields = [
        get_field('sort_of_pandas', 'int8', metadata=meta('pandas')),
        get_field('lots_of_meta', 'int8', metadata=meta('a b c d .. w x y z')),
        get_field(
            'unregistered_extension', 'int8',
            metadata=[
                ('ARROW:extension:name', '!nonexistent'),
                ('ARROW:extension:metadata', ''),
                ('ARROW:integration:allow_unregistered_extension', 'true'),
            ]),
        ListField('list_with_odd_values',
                  get_field('item', 'int32', metadata=meta('odd_values'))),
    ]

    return _generate_file('custom_metadata', fields, [1],
                          metadata=meta('schema_custom_0 schema_custom_1'))
1248
1249
def generate_duplicate_fieldnames_case():
    """Generate the 'duplicate_fieldnames' file: repeated names at the top
    level and empty (hence duplicated) names inside a struct."""
    fields = [
        get_field('ints', 'int8'),
        get_field('ints', 'int32'),
        StructField('struct', [get_field('', 'int32'),
                               get_field('', 'utf8')]),
    ]
    return _generate_file('duplicate_fieldnames', fields, [1])
1260
1261
def generate_primitive_case(batch_sizes, name='primitive'):
    """Generate a file of primitive-typed columns, one nullable and one
    non-nullable field per type."""
    types = ['bool', 'int8', 'int16', 'int32', 'int64',
             'uint8', 'uint16', 'uint32', 'uint64',
             'float32', 'float64', 'binary', 'utf8',
             'fixedsizebinary_19', 'fixedsizebinary_120']

    # For each type, emit the nullable variant first, then the non-nullable
    # one, matching the '<type>_<suffix>' naming scheme.
    fields = [
        get_field(type_ + suffix, type_, nullable=nullable)
        for type_ in types
        for suffix, nullable in (('_nullable', True), ('_nonnullable', False))
    ]

    return _generate_file(name, fields, batch_sizes)
1275
1276
def generate_primitive_large_offsets_case(batch_sizes):
    """Generate the 'primitive_large_offsets' file: binary/string columns
    using 64-bit offsets, in nullable and non-nullable variants."""
    fields = [
        get_field(type_ + suffix, type_, nullable=nullable)
        for type_ in ('largebinary', 'largeutf8')
        for suffix, nullable in (('_nullable', True), ('_nonnullable', False))
    ]

    return _generate_file('primitive_large_offsets', fields, batch_sizes)
1287
1288
def generate_null_case(batch_sizes):
    """Generate the 'null' file, interleaving null-typed columns with
    non-null ones so implementations must read/write the appropriate
    number of buffers (zero) for each null column."""
    fields = [
        NullField(name='f0'),
        get_field('f1', 'int32'),
        NullField(name='f2'),
        get_field('f3', 'float64'),
        NullField(name='f4'),
    ]
    return _generate_file('null', fields, batch_sizes)
1300
1301
def generate_null_trivial_case(batch_sizes):
    """Generate the 'null_trivial' file: a single null-typed column, so the
    record batches carry no buffers at all."""
    return _generate_file('null_trivial', [NullField(name='f0')], batch_sizes)
1308
1309
def generate_decimal128_case():
    """Generate the 'decimal' file: one decimal128 column per precision
    in [3, 38], all with scale 2."""
    fields = []
    for i, precision in enumerate(range(3, 39)):
        fields.append(DecimalField(name='f{}'.format(i), precision=precision,
                                   scale=2, bit_width=128))

    # Alternate batch sizes of 7 and 10 across the batches.
    batch_sizes = [(7, 10)[i % 2] for i in range(len(fields))]
    # 'decimal' is the original name of this test; it must stay unchanged so
    # that the pre-generated "gold" files testing backwards compatibility
    # match it and can be appropriately skipped.
    return _generate_file('decimal', fields, batch_sizes)
1323
1324
def generate_decimal256_case():
    """Generate the 'decimal256' file: one decimal256 column per precision
    in [37, 69], all with scale 5."""
    fields = []
    for i, precision in enumerate(range(37, 70)):
        fields.append(DecimalField(name='f{}'.format(i), precision=precision,
                                   scale=5, bit_width=256))

    # Alternate batch sizes of 7 and 10 across the batches.
    batch_sizes = [(7, 10)[i % 2] for i in range(len(fields))]
    return _generate_file('decimal256', fields, batch_sizes)
1335
1336
def generate_datetime_case():
    """Generate the 'datetime' file: date columns, time columns in every
    unit, and timestamp columns with and without time zones."""
    units = ['s', 'ms', 'us', 'ns']

    fields = [
        DateField('f0', DateField.DAY),
        DateField('f1', DateField.MILLISECOND),
    ]
    # f2..f5: times in each unit; f6..f9: zoneless timestamps in each unit.
    fields += [TimeField('f{}'.format(i + 2), unit)
               for i, unit in enumerate(units)]
    fields += [TimestampField('f{}'.format(i + 6), unit)
               for i, unit in enumerate(units)]
    # f10..f14: timestamps with an explicit tz argument (including tz=None).
    fields += [
        TimestampField('f10', 'ms', tz=None),
        TimestampField('f11', 's', tz='UTC'),
        TimestampField('f12', 'ms', tz='US/Eastern'),
        TimestampField('f13', 'us', tz='Europe/Paris'),
        TimestampField('f14', 'ns', tz='US/Pacific'),
    ]

    return _generate_file("datetime", fields, [7, 10])
1358
1359
def generate_interval_case():
    """Generate the 'interval' file: duration columns in every unit plus
    year/month and day/time interval columns."""
    fields = [DurationIntervalField('f{}'.format(i + 1), unit)
              for i, unit in enumerate(['s', 'ms', 'us', 'ns'])]
    fields += [
        YearMonthIntervalField('f5'),
        DayTimeIntervalField('f6'),
    ]

    return _generate_file("interval", fields, [7, 10])
1372
1373
def generate_month_day_nano_interval_case():
    """Generate the 'interval_mdn' file: a single month/day/nanosecond
    interval column."""
    return _generate_file("interval_mdn", [MonthDayNanoIntervalField('f1')],
                          [7, 10])
1381
1382
def generate_map_case():
    """Generate the 'map' file: a nullable map column with non-nullable
    utf8 keys and nullable int32 values."""
    key_field = get_field('key', 'utf8', nullable=False)
    value_field = get_field('value', 'int32')
    fields = [MapField('map_nullable', key_field, value_field)]

    return _generate_file("map", fields, [7, 10])
1391
1392
def generate_non_canonical_map_case():
    """Generate the 'map_non_canonical' file: a map column whose key,
    value, and entries children use non-standard names."""
    map_field = MapField('map_other_names',
                         get_field('some_key', 'utf8', nullable=False),
                         get_field('some_value', 'int32'),
                         entries_name='some_entries')

    return _generate_file("map_non_canonical", [map_field], [7])
1403
1404
def generate_nested_case():
    """Generate the 'nested' file: list, fixed-size list, and struct
    columns, each with nullable children."""
    fields = [
        ListField('list_nullable', get_field('item', 'int32')),
        FixedSizeListField('fixedsizelist_nullable',
                           get_field('item', 'int32'), 4),
        StructField('struct_nullable',
                    [get_field('f1', 'int32'), get_field('f2', 'utf8')]),
        # Fails on Go (ARROW-8452)
        # ListField('list_nonnullable', get_field('item', 'int32'),
        #           nullable=False),
    ]

    return _generate_file("nested", fields, [7, 10])
1419
1420
def generate_recursive_nested_case():
    """Generate the 'recursive_nested' file: a list of lists and a list
    of structs."""
    inner_list = ListField('inner_list', get_field('item', 'int16'))
    inner_struct = StructField('inner_struct',
                               [get_field('f1', 'int32'),
                                get_field('f2', 'utf8')])
    fields = [
        ListField('lists_list', inner_list),
        ListField('structs_list', inner_struct),
    ]

    return _generate_file("recursive_nested", fields, [7, 10])
1433
1434
def generate_nested_large_offsets_case():
    """Generate the 'nested_large_offsets' file: large-list columns
    (nullable, non-nullable, and one wrapping a regular list)."""
    fields = [
        LargeListField('large_list_nullable', get_field('item', 'int32')),
        LargeListField('large_list_nonnullable',
                       get_field('item', 'int32'), nullable=False),
        LargeListField('large_list_nested',
                       ListField('inner_list', get_field('item', 'int16'))),
    ]

    return _generate_file("nested_large_offsets", fields, [0, 13])
1446
1447
def generate_unions_case():
    """Generate the 'union' file: sparse and dense unions, in nullable and
    non-nullable variants, with non-default type ids.

    Note the repeated top-level field names ('sparse', 'dense'); they are
    kept as-is since Arrow schemas allow duplicates.
    """
    sparse_nullable = SparseUnionField(
        'sparse',
        [get_field('f1', 'int32'), get_field('f2', 'utf8')],
        type_ids=[5, 7])
    dense_nullable = DenseUnionField(
        'dense',
        [get_field('f1', 'int16'), get_field('f2', 'binary')],
        type_ids=[10, 20])
    sparse_nonnull = SparseUnionField(
        'sparse',
        [get_field('f1', 'float32', nullable=False), get_field('f2', 'bool')],
        type_ids=[5, 7], nullable=False)
    dense_nonnull = DenseUnionField(
        'dense',
        [get_field('f1', 'uint8', nullable=False),
         get_field('f2', 'uint16'),
         NullField('f3')],
        type_ids=[42, 43, 44], nullable=False)

    fields = [sparse_nullable, dense_nullable, sparse_nonnull, dense_nonnull]
    return _generate_file("union", fields, [0, 11])
1467
1468
def generate_dictionary_case():
    """Generate the 'dictionary' file: dictionary-encoded columns with
    varying index widths (int8/int32/int16) and value types."""
    # NOTE: dict0 intentionally reuses the value-field name 'dictionary1';
    # preserved verbatim so the generated JSON stays identical.
    dict0 = Dictionary(0, StringField('dictionary1'), size=10, name='DICT0')
    dict1 = Dictionary(1, StringField('dictionary1'), size=5, name='DICT1')
    dict2 = Dictionary(2, get_field('dictionary2', 'int64'),
                       size=50, name='DICT2')

    fields = [
        DictionaryField('dict0', get_field('', 'int8'), dict0),
        DictionaryField('dict1', get_field('', 'int32'), dict1),
        DictionaryField('dict2', get_field('', 'int16'), dict2),
    ]
    return _generate_file("dictionary", fields, [7, 10],
                          dictionaries=[dict0, dict1, dict2])
1483
1484
def generate_dictionary_unsigned_case():
    """Generate the 'dictionary_unsigned' file: dictionary columns indexed
    by unsigned integers (uint8, uint16, uint32)."""
    dicts = [
        Dictionary(i, StringField('dictionary{}'.format(i)), size=5,
                   name='DICT{}'.format(i))
        for i in range(3)
    ]

    # TODO: JavaScript does not support uint64 dictionary indices, so
    # disabled for now
    # dict3 = Dictionary(3, StringField('dictionary3'), size=5, name='DICT3')
    index_types = ['uint8', 'uint16', 'uint32']
    fields = [
        DictionaryField('f{}'.format(i), get_field('', index_type), dct)
        for i, (index_type, dct) in enumerate(zip(index_types, dicts))
        # DictionaryField('f3', get_field('', 'uint64'), dict3)
    ]
    return _generate_file("dictionary_unsigned", fields, [7, 10],
                          dictionaries=dicts)
1503
1504
def generate_nested_dictionary_case():
    """Generate the 'nested_dictionary' file: dictionaries whose values
    are themselves nested types (a list of dict-encoded strings, and a
    struct of dict-encoded strings)."""
    dict0 = Dictionary(0, StringField('str'), size=10, name='DICT0')

    # dict1's values are lists whose items are dictionary-encoded via dict0.
    list_of_dict = ListField(
        'list',
        DictionaryField('str_dict', get_field('', 'int8'), dict0))
    dict1 = Dictionary(1, list_of_dict, size=30, name='DICT1')

    # dict2's values are structs with two dict0-encoded children.
    struct_of_dict = StructField('struct', [
        DictionaryField('str_dict_a', get_field('', 'int8'), dict0),
        DictionaryField('str_dict_b', get_field('', 'int8'), dict0),
    ])
    dict2 = Dictionary(2, struct_of_dict, size=30, name='DICT2')

    fields = [
        DictionaryField('list_dict', get_field('', 'int8'), dict1),
        DictionaryField('struct_dict', get_field('', 'int8'), dict2),
    ]

    return _generate_file("nested_dictionary", fields, [10, 13],
                          dictionaries=[dict0, dict1, dict2])
1527
1528
def generate_extension_case():
    """Generate the 'extension' file: extension-typed columns backed by a
    fixed-size binary storage type and by a dictionary-encoded one."""
    dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0')

    uuid_type = ExtensionType('uuid', 'uuid-serialized',
                              FixedSizeBinaryField('', 16))
    dict_ext_type = ExtensionType(
        'dict-extension', 'dict-extension-serialized',
        DictionaryField('str_dict', get_field('', 'int8'), dict0))

    fields = [
        ExtensionField('uuids', uuid_type),
        ExtensionField('dict_exts', dict_ext_type),
    ]

    return _generate_file("extension", fields, [0, 13],
                          dictionaries=[dict0])
1546
1547
def get_generated_json_files(tempdir=None):
    """Generate every integration test case as a JSON file under `tempdir`.

    A fresh temporary directory is created if `tempdir` is not given.
    Each case is written to ``generated_<name>.json``; the per-case
    ``skip_category`` chains record which implementations should skip it.

    Returns the list of File objects (each has its ``path`` attribute set
    by ``write()``), not the raw paths.
    """
    tempdir = tempdir or tempfile.mkdtemp(prefix='arrow-integration-')

    # NOTE: removed dead helper `_temp_path` (it was defined with a bare
    # `return` and never called).

    file_objs = [
        generate_primitive_case([], name='primitive_no_batches'),
        generate_primitive_case([17, 20], name='primitive'),
        generate_primitive_case([0, 0, 0], name='primitive_zerolength'),

        generate_primitive_large_offsets_case([17, 20])
        .skip_category('C#')
        .skip_category('Go')
        .skip_category('JS'),

        generate_null_case([10, 0])
        .skip_category('C#')
        .skip_category('JS'),   # TODO(ARROW-7900)

        generate_null_trivial_case([0, 0])
        .skip_category('C#')
        .skip_category('JS'),   # TODO(ARROW-7900)

        generate_decimal128_case()
        .skip_category('Rust'),

        generate_decimal256_case()
        .skip_category('Go')    # TODO(ARROW-7948): Decimal + Go
        .skip_category('JS')
        .skip_category('Rust'),

        generate_datetime_case()
        .skip_category('C#'),

        generate_interval_case()
        .skip_category('C#')
        .skip_category('JS')    # TODO(ARROW-5239): Intervals + JS
        .skip_category('Rust'),

        generate_month_day_nano_interval_case()
        .skip_category('C#')
        .skip_category('JS')
        .skip_category('Rust'),

        generate_map_case()
        .skip_category('C#')
        .skip_category('Rust'),

        generate_non_canonical_map_case()
        .skip_category('C#')
        .skip_category('Java')  # TODO(ARROW-8715)
        .skip_category('JS')    # TODO(ARROW-8716)
        .skip_category('Rust'),

        generate_nested_case()
        .skip_category('C#'),

        generate_recursive_nested_case()
        .skip_category('C#'),

        generate_nested_large_offsets_case()
        .skip_category('C#')
        .skip_category('Go')
        .skip_category('JS')
        .skip_category('Rust'),

        generate_unions_case()
        .skip_category('C#')
        .skip_category('Go')
        .skip_category('JS')
        .skip_category('Rust'),

        generate_custom_metadata_case()
        .skip_category('C#')
        .skip_category('JS'),

        generate_duplicate_fieldnames_case()
        .skip_category('C#')
        .skip_category('Go')
        .skip_category('JS'),

        # TODO(ARROW-3039, ARROW-5267): Dictionaries in GO
        generate_dictionary_case()
        .skip_category('C#')
        .skip_category('Go'),

        generate_dictionary_unsigned_case()
        .skip_category('C#')
        .skip_category('Go')     # TODO(ARROW-9378)
        .skip_category('Java'),  # TODO(ARROW-9377)

        generate_nested_dictionary_case()
        .skip_category('C#')
        .skip_category('Go')
        .skip_category('Java')  # TODO(ARROW-7779)
        .skip_category('JS')
        .skip_category('Rust'),

        generate_extension_case()
        .skip_category('C#')
        .skip_category('Go')    # TODO(ARROW-3039): requires dictionaries
        .skip_category('JS')
        .skip_category('Rust'),
    ]

    generated_paths = []
    for file_obj in file_objs:
        out_path = os.path.join(tempdir, 'generated_' +
                                file_obj.name + '.json')
        file_obj.write(out_path)
        generated_paths.append(file_obj)

    return generated_paths