]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
/// Logical types, vector layouts, and schemas

/// Format Version History.
/// Version 1.0 - Forward and backwards compatibility guaranteed.
/// Version 1.1 - Add Decimal256 (No format release).
/// Version 1.2 (Pending) - Add Interval MONTH_DAY_NANO
24 | ||
25 | namespace org.apache.arrow.flatbuf; | |
26 | ||
enum MetadataVersion:short {
  /// 0.1.0 (October 2016).
  V1,

  /// 0.2.0 (February 2017). Non-backwards compatible with V1.
  V2,

  /// 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2.
  V3,

  /// >= 0.8.0 (December 2017). Non-backwards compatible with V3.
  V4,

  /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
  /// metadata and IPC messages). Implementations are recommended to provide a
  /// V4 compatibility mode with V5 format changes disabled.
  ///
  /// Incompatible changes between V4 and V5:
  /// - Union buffer layout has changed. In V5, Unions don't have a validity
  ///   bitmap buffer.
  V5,
}
49 | ||
/// Represents Arrow Features that might not have full support
/// within implementations. This is intended to be used in
/// two scenarios:
/// 1. A mechanism for readers of Arrow Streams
///    and files to understand that the stream or file makes
///    use of a feature that isn't supported or is unknown to
///    the implementation (and therefore can meet the Arrow
///    forward compatibility guarantees).
/// 2. A means of negotiating between a client and server
///    what features a stream is allowed to use. The enum
///    values here are intended to represent higher level
///    features; additional details may be negotiated
///    with key-value pairs specific to the protocol.
///
/// Enums added to this list should be assigned power-of-two values
/// to facilitate exchanging and comparing bitmaps for supported
/// features.
enum Feature : long {
  /// Needed to make flatbuffers happy.
  UNUSED = 0,
  /// The stream makes use of multiple full dictionaries with the
  /// same ID and assumes clients implement dictionary replacement
  /// correctly.
  DICTIONARY_REPLACEMENT = 1,
  /// The stream makes use of compressed bodies as described
  /// in Message.fbs.
  COMPRESSED_BODY = 2
}
78 | ||
/// These are stored in the flatbuffer in the Type union below

/// Logical type whose values are always null; it has no physical buffers.
table Null {
}

/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct
/// (according to the physical memory layout). We used Struct_ here as
/// Struct is a reserved word in Flatbuffers
table Struct_ {
}

/// Variable-length list of values (see LargeList for the 64-bit-offset
/// variant).
table List {
}

/// Same as List, but with 64-bit offsets, allowing to represent
/// extremely large data values.
table LargeList {
}

table FixedSizeList {
  /// Number of list items per value
  listSize: int;
}
102 | ||
/// A Map is a logical nested type that is represented as
///
/// List<entries: Struct<key: K, value: V>>
///
/// In this layout, the keys and values are each respectively contiguous. We do
/// not constrain the key and value types, so the application is responsible
/// for ensuring that the keys are hashable and unique. Whether the keys are sorted
/// may be set in the metadata for this field.
///
/// In a field with Map type, the field has a child Struct field, which then
/// has two children: the first the key type and the second the value type. The
/// names of the child fields may be respectively "entries", "key", and "value",
/// but this is not enforced.
///
/// Map
/// ```text
///   - child[0] entries: Struct
///     - child[0] key: K
///     - child[1] value: V
/// ```
/// Neither the "entries" field nor the "key" field may be nullable.
///
/// The metadata is structured so that Arrow systems without special handling
/// for Map can make Map an alias for List. The "layout" attribute for the Map
/// field must have the same contents as a List.
table Map {
  /// Set to true if the keys within each value are sorted
  keysSorted: bool;
}
132 | ||
enum UnionMode:short { Sparse, Dense }

/// A union is a complex type with children in Field.
/// By default, ids in the type vector refer to the offsets in the children.
/// Optionally, typeIds provides an indirection between the child offset and
/// the type id: for each child, `typeIds[offset]` is the id used in the type
/// vector.
table Union {
  mode: UnionMode;
  typeIds: [ int ]; // optional, describes typeid of each child.
}
143 | ||
/// Signed or unsigned integer of a given bit width.
table Int {
  bitWidth: int; // restricted to 8, 16, 32, and 64 in v1
  is_signed: bool;
}

enum Precision:short {HALF, SINGLE, DOUBLE}

/// Floating-point number of one of three precisions (see Precision).
table FloatingPoint {
  precision: Precision;
}
154 | ||
/// Unicode with UTF-8 encoding
table Utf8 {
}

/// Opaque binary data
table Binary {
}

/// Same as Utf8, but with 64-bit offsets, allowing to represent
/// extremely large data values.
table LargeUtf8 {
}

/// Same as Binary, but with 64-bit offsets, allowing to represent
/// extremely large data values.
table LargeBinary {
}

/// Opaque binary data where every value has the same fixed size (no offsets
/// buffer needed).
table FixedSizeBinary {
  /// Number of bytes per value
  byteWidth: int;
}

table Bool {
}
180 | ||
/// Exact decimal value represented as an integer value in two's
/// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers
/// are used. The representation uses the endianness indicated
/// in the Schema.
table Decimal {
  /// Total number of decimal digits
  precision: int;

  /// Number of digits after the decimal point "."
  scale: int;

  /// Number of bits per value. The only accepted widths are 128 and 256.
  /// We use bitWidth for consistency with Int::bitWidth.
  bitWidth: int = 128;
}
196 | ||
enum DateUnit: short {
  DAY,
  MILLISECOND
}

/// Date is either a 32-bit or 64-bit signed integer type representing an
/// elapsed time since UNIX epoch (1970-01-01), stored in either of two units:
///
/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
///   leap seconds), where the values are evenly divisible by 86400000
/// * Days (32 bits) since the UNIX epoch
table Date {
  unit: DateUnit = MILLISECOND;
}
211 | ||
enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND }

/// Time is either a 32-bit or 64-bit signed integer type representing an
/// elapsed time since midnight, stored in either of four units: seconds,
/// milliseconds, microseconds or nanoseconds.
///
/// The integer `bitWidth` depends on the `unit` and must be one of the following:
/// * SECOND and MILLISECOND: 32 bits
/// * MICROSECOND and NANOSECOND: 64 bits
///
/// The allowed values are between 0 (inclusive) and 86400 (=24*60*60) seconds
/// (exclusive), adjusted for the time unit (for example, up to 86400000
/// exclusive for the MILLISECOND unit).
/// This definition doesn't allow for leap seconds. Time values from
/// measurements with leap seconds will need to be corrected when ingesting
/// into Arrow (for example by replacing the value 86400 with 86399).
table Time {
  unit: TimeUnit = MILLISECOND;
  bitWidth: int = 32;
}
232 | ||
233 | /// Timestamp is a 64-bit signed integer representing an elapsed time since a | |
234 | /// fixed epoch, stored in either of four units: seconds, milliseconds, | |
235 | /// microseconds or nanoseconds, and is optionally annotated with a timezone. | |
236 | /// | |
237 | /// Timestamp values do not include any leap seconds (in other words, all | |
238 | /// days are considered 86400 seconds long). | |
239 | /// | |
240 | /// Timestamps with a non-empty timezone | |
241 | /// ------------------------------------ | |
242 | /// | |
243 | /// If a Timestamp column has a non-empty timezone value, its epoch is | |
244 | /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone | |
245 | /// (the Unix epoch), regardless of the Timestamp's own timezone. | |
246 | /// | |
247 | /// Therefore, timestamp values with a non-empty timezone correspond to | |
248 | /// physical points in time together with some additional information about | |
249 | /// how the data was obtained and/or how to display it (the timezone). | |
250 | /// | |
251 | /// For example, the timestamp value 0 with the timezone string "Europe/Paris" | |
252 | /// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the | |
253 | /// application may prefer to display it as "January 1st 1970, 01h00" in | |
254 | /// the Europe/Paris timezone (which is the same physical point in time). | |
255 | /// | |
256 | /// One consequence is that timestamp values with a non-empty timezone | |
257 | /// can be compared and ordered directly, since they all share the same | |
258 | /// well-known point of reference (the Unix epoch). | |
259 | /// | |
260 | /// Timestamps with an unset / empty timezone | |
261 | /// ----------------------------------------- | |
262 | /// | |
263 | /// If a Timestamp column has no timezone value, its epoch is | |
264 | /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone. | |
265 | /// | |
266 | /// Therefore, timestamp values without a timezone cannot be meaningfully | |
267 | /// interpreted as physical points in time, but only as calendar / clock | |
268 | /// indications ("wall clock time") in an unspecified timezone. | |
269 | /// | |
270 | /// For example, the timestamp value 0 with an empty timezone string | |
271 | /// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there | |
272 | /// is not enough information to interpret it as a well-defined physical | |
273 | /// point in time. | |
274 | /// | |
275 | /// One consequence is that timestamp values without a timezone cannot | |
276 | /// be reliably compared or ordered, since they may have different points of | |
277 | /// reference. In particular, it is *not* possible to interpret an unset | |
278 | /// or empty timezone as the same as "UTC". | |
279 | /// | |
280 | /// Conversion between timezones | |
281 | /// ---------------------------- | |
282 | /// | |
283 | /// If a Timestamp column has a non-empty timezone, changing the timezone | |
284 | /// to a different non-empty value is a metadata-only operation: | |
285 | /// the timestamp values need not change as their point of reference remains | |
286 | /// the same (the Unix epoch). | |
287 | /// | |
/// However, if a Timestamp column has no timezone value, changing it to a
/// non-empty value requires thinking about the desired semantics.
/// One possibility is to assume that the original timestamp values are
/// relative to the epoch of the timezone being set; timestamp values should
/// then be adjusted to the Unix epoch (for example, changing the timezone from
/// empty to "Europe/Paris" would require converting the timestamp values
/// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
/// nevertheless correct).
296 | /// | |
297 | /// Guidelines for encoding data from external libraries | |
298 | /// ---------------------------------------------------- | |
299 | /// | |
300 | /// Date & time libraries often have multiple different data types for temporal | |
301 | /// data. In order to ease interoperability between different implementations the | |
302 | /// Arrow project has some recommendations for encoding these types into a Timestamp | |
303 | /// column. | |
304 | /// | |
305 | /// An "instant" represents a physical point in time that has no relevant timezone | |
306 | /// (for example, astronomical data). To encode an instant, use a Timestamp with | |
307 | /// the timezone string set to "UTC", and make sure the Timestamp values | |
308 | /// are relative to the UTC epoch (January 1st 1970, midnight). | |
309 | /// | |
310 | /// A "zoned date-time" represents a physical point in time annotated with an | |
311 | /// informative timezone (for example, the timezone in which the data was | |
312 | /// recorded). To encode a zoned date-time, use a Timestamp with the timezone | |
313 | /// string set to the name of the timezone, and make sure the Timestamp values | |
314 | /// are relative to the UTC epoch (January 1st 1970, midnight). | |
315 | /// | |
316 | /// (There is some ambiguity between an instant and a zoned date-time with the | |
317 | /// UTC timezone. Both of these are stored the same in Arrow. Typically, | |
318 | /// this distinction does not matter. If it does, then an application should | |
319 | /// use custom metadata or an extension type to distinguish between the two cases.) | |
320 | /// | |
321 | /// An "offset date-time" represents a physical point in time combined with an | |
322 | /// explicit offset from UTC. To encode an offset date-time, use a Timestamp | |
323 | /// with the timezone string set to the numeric timezone offset string | |
324 | /// (e.g. "+03:00"), and make sure the Timestamp values are relative to | |
325 | /// the UTC epoch (January 1st 1970, midnight). | |
326 | /// | |
327 | /// A "naive date-time" (also called "local date-time" in some libraries) | |
328 | /// represents a wall clock time combined with a calendar date, but with | |
329 | /// no indication of how to map this information to a physical point in time. | |
330 | /// Naive date-times must be handled with care because of this missing | |
331 | /// information, and also because daylight saving time (DST) may make | |
332 | /// some values ambiguous or non-existent. A naive date-time may be | |
333 | /// stored as a struct with Date and Time fields. However, it may also be | |
334 | /// encoded into a Timestamp column with an empty timezone. The timestamp | |
335 | /// values should be computed "as if" the timezone of the date-time values | |
336 | /// was UTC; for example, the naive date-time "January 1st 1970, 00h00" would | |
337 | /// be encoded as timestamp value 0. | |
table Timestamp {
  /// Resolution of the stored 64-bit integer values.
  unit: TimeUnit;

  /// The timezone is an optional string indicating the name of a timezone,
  /// one of:
  ///
  /// * As used in the Olson timezone database (the "tz database" or
  ///   "tzdata"), such as "America/New_York".
  /// * An absolute timezone offset of the form "+XX:XX" or "-XX:XX",
  ///   such as "+07:30".
  ///
  /// Whether a timezone string is present indicates different semantics about
  /// the data (see above).
  timezone: string;
}
353 | ||
enum IntervalUnit: short { YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO}
// A "calendar" interval which models types that don't necessarily
// have a precise duration without the context of a base timestamp (e.g.
// days can differ in length during daylight saving time transitions).
// All integers in the types below are stored in the endianness indicated
// by the schema.
//
// YEAR_MONTH - Indicates the number of elapsed whole months, stored as
//   4-byte signed integers.
// DAY_TIME - Indicates the number of elapsed days and milliseconds (no leap seconds),
//   stored as 2 contiguous 32-bit signed integers (8-bytes in total). Support
//   of this IntervalUnit is not required for full arrow compatibility.
// MONTH_DAY_NANO - A triple of the number of elapsed months, days, and nanoseconds.
//   The values are stored contiguously in 16-byte blocks. Months and days are
//   encoded as 32-bit signed integers and nanoseconds is encoded as a 64-bit
//   signed integer. Nanoseconds does not allow for leap seconds. Each field is
//   independent (e.g. there is no constraint that nanoseconds have the same
//   sign as days or that the quantity of nanoseconds represents less than a
//   day's worth of time).
table Interval {
  unit: IntervalUnit;
}
376 | ||
// An absolute length of time unrelated to any calendar artifacts.
//
// For the purposes of Arrow Implementations, adding this value to a Timestamp
// ("t1") naively (i.e. simply summing the two numbers) is acceptable even
// though in some cases the resulting Timestamp (t2) would not account for
// leap-seconds during the elapsed time between "t1" and "t2". Similarly,
// representing the difference between two Unix timestamps is acceptable, but
// would yield a value that is possibly a few seconds off from the true elapsed
// time.
//
// The resolution defaults to millisecond, but can be any of the other
// supported TimeUnit values as with Timestamp and Time types. This type is
// always represented as an 8-byte integer.
table Duration {
  unit: TimeUnit = MILLISECOND;
}
393 | ||
/// ----------------------------------------------------------------------
/// Top-level Type value, enabling extensible type-specific metadata. We can
/// add new logical types to Type without breaking backwards compatibility

union Type {
  Null,
  Int,
  FloatingPoint,
  Binary,
  Utf8,
  Bool,
  Decimal,
  Date,
  Time,
  Timestamp,
  Interval,
  List,
  Struct_,
  Union,
  FixedSizeBinary,
  FixedSizeList,
  Map,
  Duration,
  LargeBinary,
  LargeUtf8,
  LargeList,
}
421 | ||
/// ----------------------------------------------------------------------
/// User-defined key value pairs to add custom metadata to arrow;
/// key namespacing is the responsibility of the user
table KeyValue {
  key: string;
  value: string;
}
430 | ||
/// ----------------------------------------------------------------------
/// Dictionary encoding metadata
/// Maintained for forwards compatibility; in the future
/// Dictionaries might be explicit maps between integers and values
/// allowing for non-contiguous index values
enum DictionaryKind : short { DenseArray }
table DictionaryEncoding {
  /// The known dictionary id in the application where this data is used. In
  /// the file or streaming formats, the dictionary ids are found in the
  /// DictionaryBatch messages
  id: long;

  /// The dictionary indices are constrained to be non-negative integers. If
  /// this field is null, the indices must be signed int32. To maximize
  /// cross-language compatibility and performance, implementations are
  /// recommended to prefer signed integer types over unsigned integer types
  /// and to avoid uint64 indices unless they are required by an application.
  indexType: Int;

  /// By default, dictionaries are not ordered, or the order does not have
  /// semantic meaning. In some statistical applications, dictionary-encoding
  /// is used to represent ordered categorical data, and we provide a way to
  /// preserve that metadata here
  isOrdered: bool;

  dictionaryKind: DictionaryKind;
}
458 | ||
/// ----------------------------------------------------------------------
/// A field represents a named column in a record / row batch or child of a
/// nested type.

table Field {
  /// Name is not required (e.g. in a List)
  name: string;

  /// Whether or not this field can contain nulls. Should be true in general.
  nullable: bool;

  /// This is the type of the decoded value if the field is dictionary encoded.
  type: Type;

  /// Present only if the field is dictionary encoded.
  dictionary: DictionaryEncoding;

  /// children apply only to nested data types like Struct, List and Union. For
  /// primitive types children will have length 0.
  children: [ Field ];

  /// User-defined metadata
  custom_metadata: [ KeyValue ];
}
483 | ||
/// ----------------------------------------------------------------------
/// Endianness of the platform producing the data

enum Endianness:short { Little, Big }

/// ----------------------------------------------------------------------
/// A Buffer represents a single contiguous memory segment
struct Buffer {
  /// The relative offset into the shared memory page where the bytes for this
  /// buffer starts
  offset: long;

  /// The absolute length (in bytes) of the memory buffer. The memory is found
  /// from offset (inclusive) to offset + length (non-inclusive). When building
  /// messages using the encapsulated IPC message, padding bytes may be written
  /// after a buffer, but such padding bytes do not need to be accounted for in
  /// the size here.
  length: long;
}
503 | ||
/// ----------------------------------------------------------------------
/// A Schema describes the columns in a row batch

table Schema {

  /// Endianness of the buffers; it is Little Endian by default.
  /// If the endianness doesn't match the underlying system, the vectors need
  /// to be converted.
  endianness: Endianness=Little;

  fields: [Field];
  // User-defined metadata
  custom_metadata: [ KeyValue ];

  /// Features used in the stream/file.
  features : [ Feature ];
}

root_type Schema;