// Source: git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/python/serialize.h
// import quincy 17.2.0
// [ceph.git] / ceph / src / arrow / cpp / src / arrow / python / serialize.h
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
17
#pragma once

#include <cstdint>
#include <memory>
#include <vector>

#include "arrow/ipc/options.h"
#include "arrow/python/visibility.h"
#include "arrow/sparse_tensor.h"
#include "arrow/status.h"
27
// Forward declaration of PyObject, see
// https://mail.python.org/pipermail/python-dev/2003-August/037601.html
// The declaration is skipped when the PyObject_HEAD macro is already defined.
#ifndef PyObject_HEAD
struct _object;
typedef _object PyObject;
#endif
34
35 namespace arrow {
36
// Forward declarations of Arrow types that the declarations in this header
// only refer to through pointers or std::shared_ptr.
class Buffer;
class DataType;
class MemoryPool;
class RecordBatch;
class Tensor;

namespace io {

class OutputStream;

}  // namespace io
48
49 namespace py {
50
51 struct ARROW_PYTHON_EXPORT SerializedPyObject {
52 std::shared_ptr<RecordBatch> batch;
53 std::vector<std::shared_ptr<Tensor>> tensors;
54 std::vector<std::shared_ptr<SparseTensor>> sparse_tensors;
55 std::vector<std::shared_ptr<Tensor>> ndarrays;
56 std::vector<std::shared_ptr<Buffer>> buffers;
57 ipc::IpcWriteOptions ipc_options;
58
59 SerializedPyObject();
60
61 /// \brief Write serialized Python object to OutputStream
62 /// \param[in,out] dst an OutputStream
63 /// \return Status
64 Status WriteTo(io::OutputStream* dst);
65
66 /// \brief Convert SerializedPyObject to a dict containing the message
67 /// components as Buffer instances with minimal memory allocation
68 ///
69 /// {
70 /// 'num_tensors': M,
71 /// 'num_sparse_tensors': N,
72 /// 'num_buffers': K,
73 /// 'data': [Buffer]
74 /// }
75 ///
76 /// Each tensor is written as two buffers, one for the metadata and one for
77 /// the body. Therefore, the number of buffers in 'data' is 2 * M + 2 * N + K + 1,
78 /// with the first buffer containing the serialized record batch containing
79 /// the UnionArray that describes the whole object
80 Status GetComponents(MemoryPool* pool, PyObject** out);
81 };
82
83 /// \brief Serialize Python sequence as a SerializedPyObject.
84 /// \param[in] context Serialization context which contains custom serialization
85 /// and deserialization callbacks. Can be any Python object with a
86 /// _serialize_callback method for serialization and a _deserialize_callback
87 /// method for deserialization. If context is None, no custom serialization
88 /// will be attempted.
89 /// \param[in] sequence A Python sequence object to serialize to Arrow data
90 /// structures
91 /// \param[out] out The serialized representation
92 /// \return Status
93 ///
94 /// Release GIL before calling
95 ARROW_PYTHON_EXPORT
96 Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out);
97
98 /// \brief Serialize an Arrow Tensor as a SerializedPyObject.
99 /// \param[in] tensor Tensor to be serialized
100 /// \param[out] out The serialized representation
101 /// \return Status
102 ARROW_PYTHON_EXPORT
103 Status SerializeTensor(std::shared_ptr<Tensor> tensor, py::SerializedPyObject* out);
104
105 /// \brief Write the Tensor metadata header to an OutputStream.
106 /// \param[in] dtype DataType of the Tensor
107 /// \param[in] shape The shape of the tensor
108 /// \param[in] tensor_num_bytes The length of the Tensor data in bytes
109 /// \param[in] dst The OutputStream to write the Tensor header to
110 /// \return Status
111 ARROW_PYTHON_EXPORT
112 Status WriteNdarrayHeader(std::shared_ptr<DataType> dtype,
113 const std::vector<int64_t>& shape, int64_t tensor_num_bytes,
114 io::OutputStream* dst);
115
/// \brief Enumeration of the Python-side value kinds named by this header.
///
/// The enumerators take consecutive implicit values starting at 0 (NONE), so
/// NUM_PYTHON_TYPES equals the number of kinds listed before it. Inserting or
/// reordering entries changes the numeric value of every later enumerator.
/// NOTE(review): presumably these values are used as type codes in the
/// serialized data -- confirm before changing the order.
struct PythonType {
  enum type {
    NONE,
    BOOL,
    INT,
    PY2INT,  // Kept for compatibility
    BYTES,
    STRING,
    HALF_FLOAT,
    FLOAT,
    DOUBLE,
    DATE64,
    LIST,
    DICT,
    TUPLE,
    SET,
    TENSOR,
    NDARRAY,
    BUFFER,
    SPARSECOOTENSOR,
    SPARSECSRMATRIX,
    SPARSECSCMATRIX,
    SPARSECSFTENSOR,
    NUM_PYTHON_TYPES  // Sentinel: count of the kinds above, not a kind itself
  };
};
142
143 } // namespace py
144
145 } // namespace arrow