# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# flake8: noqa

20"""
21PyArrow is the python implementation of Apache Arrow.
22
23Apache Arrow is a cross-language development platform for in-memory data.
24It specifies a standardized language-independent columnar memory format for
25flat and hierarchical data, organized for efficient analytic operations on
26modern hardware. It also provides computational libraries and zero-copy
27streaming messaging and interprocess communication.
28
29For more information see the official page at https://arrow.apache.org
30"""

import gc as _gc
import os as _os
import sys as _sys
import warnings as _warnings

try:
    from ._generated_version import version as __version__
except ImportError:
    # Package is not installed, parse git tag at runtime
    try:
        import setuptools_scm
        # Code duplicated from setup.py to avoid a dependency on each other

        def parse_git(root, **kwargs):
            """
            Parse function for setuptools_scm that ignores tags for non-C++
            subprojects, e.g. apache-arrow-js-XXX tags.
            """
            from setuptools_scm.git import parse
            kwargs['describe_command'] = \
                "git describe --dirty --tags --long --match 'apache-arrow-[0-9].*'"
            return parse(root, **kwargs)
        __version__ = setuptools_scm.get_version('../',
                                                 parse=parse_git)
    except ImportError:
        __version__ = None

# ARROW-8684: Disable GC while initializing the Cython extension module,
# to work around the Cython bug in https://github.com/cython/cython/issues/3603
_gc_enabled = _gc.isenabled()
_gc.disable()
import pyarrow.lib as _lib
if _gc_enabled:
    _gc.enable()

from pyarrow.lib import (BuildInfo, RuntimeInfo, MonthDayNano,
                         VersionInfo, cpp_build_info, cpp_version,
                         cpp_version_info, runtime_info, cpu_count,
                         set_cpu_count, enable_signal_handlers,
                         io_thread_count, set_io_thread_count)
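
# Illustrative usage sketch (not part of the original module): the helpers
# imported above expose Arrow's global thread pools; the pool sizes shown
# below are examples only and depend on the machine.
#
#   >>> import pyarrow as pa
#   >>> pa.cpu_count()          # size of the CPU (compute) thread pool
#   8
#   >>> pa.set_cpu_count(4)     # shrink the compute pool
#   >>> pa.io_thread_count()    # size of the I/O thread pool
#   8
#   >>> pa.set_io_thread_count(16)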


def show_versions():
    """
    Print various version information, to help with error reporting.
    """
    # TODO: CPU information and flags
    print("pyarrow version info\n--------------------")
    print("Package kind: {}".format(cpp_build_info.package_kind
                                    if len(cpp_build_info.package_kind) > 0
                                    else "not indicated"))
    print("Arrow C++ library version: {0}".format(cpp_build_info.version))
    print("Arrow C++ compiler: {0} {1}"
          .format(cpp_build_info.compiler_id, cpp_build_info.compiler_version))
    print("Arrow C++ compiler flags: {0}"
          .format(cpp_build_info.compiler_flags))
    print("Arrow C++ git revision: {0}".format(cpp_build_info.git_id))
    print("Arrow C++ git description: {0}"
          .format(cpp_build_info.git_description))
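
# Usage sketch (illustrative only): the output fields mirror the print calls
# above, while the reported values depend on the local build.
#
#   >>> import pyarrow as pa
#   >>> pa.show_versions()
#   pyarrow version info
#   --------------------
#   Package kind: not indicated
#   Arrow C++ library version: 6.0.1
#   ...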


from pyarrow.lib import (null, bool_,
                         int8, int16, int32, int64,
                         uint8, uint16, uint32, uint64,
                         time32, time64, timestamp, date32, date64, duration,
                         month_day_nano_interval,
                         float16, float32, float64,
                         binary, string, utf8,
                         large_binary, large_string, large_utf8,
                         decimal128, decimal256,
                         list_, large_list, map_, struct,
                         union, sparse_union, dense_union,
                         dictionary,
                         field,
                         type_for_alias,
                         DataType, DictionaryType, StructType,
                         ListType, LargeListType, MapType, FixedSizeListType,
                         UnionType, SparseUnionType, DenseUnionType,
                         TimestampType, Time32Type, Time64Type, DurationType,
                         FixedSizeBinaryType, Decimal128Type, Decimal256Type,
                         BaseExtensionType, ExtensionType,
                         PyExtensionType, UnknownExtensionType,
                         register_extension_type, unregister_extension_type,
                         DictionaryMemo,
                         KeyValueMetadata,
                         Field,
                         Schema,
                         schema,
                         unify_schemas,
                         Array, Tensor,
                         array, chunked_array, record_batch, nulls, repeat,
                         SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix,
                         SparseCSFTensor,
                         infer_type, from_numpy_dtype,
                         NullArray,
                         NumericArray, IntegerArray, FloatingPointArray,
                         BooleanArray,
                         Int8Array, UInt8Array,
                         Int16Array, UInt16Array,
                         Int32Array, UInt32Array,
                         Int64Array, UInt64Array,
                         ListArray, LargeListArray, MapArray,
                         FixedSizeListArray, UnionArray,
                         BinaryArray, StringArray,
                         LargeBinaryArray, LargeStringArray,
                         FixedSizeBinaryArray,
                         DictionaryArray,
                         Date32Array, Date64Array, TimestampArray,
                         Time32Array, Time64Array, DurationArray,
                         MonthDayNanoIntervalArray,
                         Decimal128Array, Decimal256Array, StructArray, ExtensionArray,
                         scalar, NA, _NULL as NULL, Scalar,
                         NullScalar, BooleanScalar,
                         Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
                         UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar,
                         HalfFloatScalar, FloatScalar, DoubleScalar,
                         Decimal128Scalar, Decimal256Scalar,
                         ListScalar, LargeListScalar, FixedSizeListScalar,
                         Date32Scalar, Date64Scalar,
                         Time32Scalar, Time64Scalar,
                         TimestampScalar, DurationScalar,
                         MonthDayNanoIntervalScalar,
                         BinaryScalar, LargeBinaryScalar,
                         StringScalar, LargeStringScalar,
                         FixedSizeBinaryScalar, DictionaryScalar,
                         MapScalar, StructScalar, UnionScalar,
                         ExtensionScalar)
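
# Brief usage sketch (illustrative, not part of the original file): the type
# factories and constructors imported above are the usual entry points for
# building Arrow data by hand.
#
#   >>> import pyarrow as pa
#   >>> arr = pa.array([1, 2, None], type=pa.int64())   # nulls are allowed
#   >>> arr.type
#   DataType(int64)
#   >>> sch = pa.schema([pa.field('x', pa.int64()),
#   ...                  pa.field('y', pa.string())])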

# Buffers, allocation
from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer,
                         Codec, compress, decompress, allocate_buffer)

from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool,
                         total_allocated_bytes, set_memory_pool,
                         default_memory_pool, system_memory_pool,
                         jemalloc_memory_pool, mimalloc_memory_pool,
                         logging_memory_pool, proxy_memory_pool,
                         log_memory_allocations, jemalloc_set_decay_ms)
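
# Illustrative sketch (not in the original module): buffers wrap raw memory
# without copying, and the memory-pool helpers report allocations made
# through Arrow; the numbers below are examples only.
#
#   >>> import pyarrow as pa
#   >>> buf = pa.py_buffer(b"some bytes")   # zero-copy view of a Python buffer
#   >>> buf.size
#   10
#   >>> pa.total_allocated_bytes()          # bytes currently held by the default pool
#   0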

# I/O
from pyarrow.lib import (NativeFile, PythonFile,
                         BufferedInputStream, BufferedOutputStream,
                         CompressedInputStream, CompressedOutputStream,
                         TransformInputStream, transcoding_input_stream,
                         FixedSizeBufferWriter,
                         BufferReader, BufferOutputStream,
                         OSFile, MemoryMappedFile, memory_map,
                         create_memory_map, MockOutputStream,
                         input_stream, output_stream)
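
# Illustrative sketch (not in the original module): a round trip through the
# in-memory stream classes imported above.
#
#   >>> import pyarrow as pa
#   >>> sink = pa.BufferOutputStream()
#   >>> sink.write(b"hello")
#   5
#   >>> buf = sink.getvalue()               # returns a pyarrow.Buffer
#   >>> pa.BufferReader(buf).read()
#   b'hello'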

from pyarrow._hdfsio import HdfsFile, have_libhdfs

from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table,
                         concat_arrays, concat_tables)
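
# Illustrative sketch (not in the original module): tables are the usual
# columnar containers built from the constructors imported above.
#
#   >>> import pyarrow as pa
#   >>> t = pa.table({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})
#   >>> t.num_rows, t.num_columns
#   (3, 2)
#   >>> pa.concat_tables([t, t]).num_rows
#   6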

# Exceptions
from pyarrow.lib import (ArrowCancelled,
                         ArrowCapacityError,
                         ArrowException,
                         ArrowKeyError,
                         ArrowIndexError,
                         ArrowInvalid,
                         ArrowIOError,
                         ArrowMemoryError,
                         ArrowNotImplementedError,
                         ArrowTypeError,
                         ArrowSerializationError)

# Serialization
from pyarrow.lib import (deserialize_from, deserialize,
                         deserialize_components,
                         serialize, serialize_to, read_serialized,
                         SerializationCallbackError,
                         DeserializationCallbackError)

import pyarrow.hdfs as hdfs

from pyarrow.ipc import serialize_pandas, deserialize_pandas
import pyarrow.ipc as ipc

from pyarrow.serialization import (default_serialization_context,
                                   register_default_serialization_handlers,
                                   register_torch_serialization_handlers)

import pyarrow.types as types


# deprecated top-level access


from pyarrow.filesystem import FileSystem as _FileSystem
from pyarrow.filesystem import LocalFileSystem as _LocalFileSystem
from pyarrow.hdfs import HadoopFileSystem as _HadoopFileSystem

from pyarrow.lib import SerializationContext as _SerializationContext
from pyarrow.lib import SerializedPyObject as _SerializedPyObject


_localfs = _LocalFileSystem._get_instance()


_msg = (
    "pyarrow.{0} is deprecated as of 2.0.0, please use pyarrow.fs.{1} instead."
)

_serialization_msg = (
    "'pyarrow.{0}' is deprecated and will be removed in a future version. "
    "Use pickle or the pyarrow IPC functionality instead."
)

_deprecated = {
    "localfs": (_localfs, "LocalFileSystem"),
    "FileSystem": (_FileSystem, "FileSystem"),
    "LocalFileSystem": (_LocalFileSystem, "LocalFileSystem"),
    "HadoopFileSystem": (_HadoopFileSystem, "HadoopFileSystem"),
}

_serialization_deprecated = {
    "SerializationContext": _SerializationContext,
    "SerializedPyObject": _SerializedPyObject,
}

if _sys.version_info >= (3, 7):
    def __getattr__(name):
        if name in _deprecated:
            obj, new_name = _deprecated[name]
            _warnings.warn(_msg.format(name, new_name),
                           FutureWarning, stacklevel=2)
            return obj
        elif name in _serialization_deprecated:
            _warnings.warn(_serialization_msg.format(name),
                           FutureWarning, stacklevel=2)
            return _serialization_deprecated[name]

        raise AttributeError(
            "module 'pyarrow' has no attribute '{0}'".format(name)
        )
else:
    localfs = _localfs
    FileSystem = _FileSystem
    LocalFileSystem = _LocalFileSystem
    HadoopFileSystem = _HadoopFileSystem
    SerializationContext = _SerializationContext
    SerializedPyObject = _SerializedPyObject
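
# Behavior sketch (illustrative, not part of the original file): on Python 3.7+
# the module-level __getattr__ above routes the deprecated names through a
# FutureWarning before returning the underlying object.
#
#   >>> import warnings, pyarrow
#   >>> with warnings.catch_warnings(record=True) as w:
#   ...     warnings.simplefilter("always")
#   ...     fs = pyarrow.localfs            # deprecated alias, still returned
#   >>> issubclass(w[-1].category, FutureWarning)
#   True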


# Entry point for starting the plasma store


def _plasma_store_entry_point():
    """Entry point for starting the plasma store.

    This can be used by invoking e.g.
    ``plasma_store -s /tmp/plasma -m 1000000000``
    from the command line and will start the plasma_store executable with the
    given arguments.
    """
    import pyarrow
    plasma_store_executable = _os.path.join(pyarrow.__path__[0],
                                            "plasma-store-server")
    _os.execv(plasma_store_executable, _sys.argv)


# ----------------------------------------------------------------------
# Deprecations

from pyarrow.util import _deprecate_api, _deprecate_class

read_message = _deprecate_api("read_message", "ipc.read_message",
                              ipc.read_message, "0.17.0")

read_record_batch = _deprecate_api("read_record_batch",
                                   "ipc.read_record_batch",
                                   ipc.read_record_batch, "0.17.0")

read_schema = _deprecate_api("read_schema", "ipc.read_schema",
                             ipc.read_schema, "0.17.0")

read_tensor = _deprecate_api("read_tensor", "ipc.read_tensor",
                             ipc.read_tensor, "0.17.0")

write_tensor = _deprecate_api("write_tensor", "ipc.write_tensor",
                              ipc.write_tensor, "0.17.0")

get_record_batch_size = _deprecate_api("get_record_batch_size",
                                       "ipc.get_record_batch_size",
                                       ipc.get_record_batch_size, "0.17.0")

get_tensor_size = _deprecate_api("get_tensor_size",
                                 "ipc.get_tensor_size",
                                 ipc.get_tensor_size, "0.17.0")

open_stream = _deprecate_api("open_stream", "ipc.open_stream",
                             ipc.open_stream, "0.17.0")

open_file = _deprecate_api("open_file", "ipc.open_file", ipc.open_file,
                           "0.17.0")


def _deprecate_scalar(ty, symbol):
    return _deprecate_class("{}Value".format(ty), symbol, "1.0.0")


ArrayValue = _deprecate_class("ArrayValue", Scalar, "1.0.0")
NullType = _deprecate_class("NullType", NullScalar, "1.0.0")

BooleanValue = _deprecate_scalar("Boolean", BooleanScalar)
Int8Value = _deprecate_scalar("Int8", Int8Scalar)
Int16Value = _deprecate_scalar("Int16", Int16Scalar)
Int32Value = _deprecate_scalar("Int32", Int32Scalar)
Int64Value = _deprecate_scalar("Int64", Int64Scalar)
UInt8Value = _deprecate_scalar("UInt8", UInt8Scalar)
UInt16Value = _deprecate_scalar("UInt16", UInt16Scalar)
UInt32Value = _deprecate_scalar("UInt32", UInt32Scalar)
UInt64Value = _deprecate_scalar("UInt64", UInt64Scalar)
HalfFloatValue = _deprecate_scalar("HalfFloat", HalfFloatScalar)
FloatValue = _deprecate_scalar("Float", FloatScalar)
DoubleValue = _deprecate_scalar("Double", DoubleScalar)
ListValue = _deprecate_scalar("List", ListScalar)
LargeListValue = _deprecate_scalar("LargeList", LargeListScalar)
MapValue = _deprecate_scalar("Map", MapScalar)
FixedSizeListValue = _deprecate_scalar("FixedSizeList", FixedSizeListScalar)
BinaryValue = _deprecate_scalar("Binary", BinaryScalar)
StringValue = _deprecate_scalar("String", StringScalar)
LargeBinaryValue = _deprecate_scalar("LargeBinary", LargeBinaryScalar)
LargeStringValue = _deprecate_scalar("LargeString", LargeStringScalar)
FixedSizeBinaryValue = _deprecate_scalar("FixedSizeBinary",
                                         FixedSizeBinaryScalar)
Decimal128Value = _deprecate_scalar("Decimal128", Decimal128Scalar)
Decimal256Value = _deprecate_scalar("Decimal256", Decimal256Scalar)
UnionValue = _deprecate_scalar("Union", UnionScalar)
StructValue = _deprecate_scalar("Struct", StructScalar)
DictionaryValue = _deprecate_scalar("Dictionary", DictionaryScalar)
Date32Value = _deprecate_scalar("Date32", Date32Scalar)
Date64Value = _deprecate_scalar("Date64", Date64Scalar)
Time32Value = _deprecate_scalar("Time32", Time32Scalar)
Time64Value = _deprecate_scalar("Time64", Time64Scalar)
TimestampValue = _deprecate_scalar("Timestamp", TimestampScalar)
DurationValue = _deprecate_scalar("Duration", DurationScalar)


# TODO: Deprecate these somehow in the pyarrow namespace
from pyarrow.ipc import (Message, MessageReader, MetadataVersion,
                         RecordBatchFileReader, RecordBatchFileWriter,
                         RecordBatchStreamReader, RecordBatchStreamWriter)

# ----------------------------------------------------------------------
# Returning absolute path to the pyarrow include directory (if bundled, e.g. in
# wheels)


def get_include():
    """
    Return absolute path to directory containing Arrow C++ include
    headers. Similar to numpy.get_include
    """
    return _os.path.join(_os.path.dirname(__file__), 'include')


def _get_pkg_config_executable():
    return _os.environ.get('PKG_CONFIG', 'pkg-config')


def _has_pkg_config(pkgname):
    import subprocess
    try:
        return subprocess.call([_get_pkg_config_executable(),
                                '--exists', pkgname]) == 0
    except FileNotFoundError:
        return False


def _read_pkg_config_variable(pkgname, cli_args):
    import subprocess
    cmd = [_get_pkg_config_executable(), pkgname] + cli_args
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    out, err = proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError("pkg-config failed: " + err.decode('utf8'))
    return out.rstrip().decode('utf8')


def get_libraries():
    """
    Return list of library names to include in the `libraries` argument for C
    or Cython extensions using pyarrow
    """
    return ['arrow', 'arrow_python']


def create_library_symlinks():
    """
    With Linux and macOS wheels, the bundled shared libraries have an embedded
    ABI version like libarrow.so.17 or libarrow.17.dylib and so linking to them
    with -larrow won't work unless we create symlinks at locations like
    site-packages/pyarrow/libarrow.so. This unfortunate workaround addresses
    prior problems we had with shipping two copies of the shared libraries to
    permit third party projects like turbodbc to build their C++ extensions
    against the pyarrow wheels.

    This function must only be invoked once and only when the shared libraries
    are bundled with the Python package, which should only apply to wheel-based
    installs. It requires write access to the site-packages/pyarrow directory
    and so, depending on your system, may need to be run with root privileges.
    """
    import glob
    if _sys.platform == 'win32':
        return
    package_cwd = _os.path.dirname(__file__)

    if _sys.platform == 'linux':
        bundled_libs = glob.glob(_os.path.join(package_cwd, '*.so.*'))

        def get_symlink_path(hard_path):
            return hard_path.rsplit('.', 1)[0]
    else:
        bundled_libs = glob.glob(_os.path.join(package_cwd, '*.*.dylib'))

        def get_symlink_path(hard_path):
            return '.'.join((hard_path.rsplit('.', 2)[0], 'dylib'))

    for lib_hard_path in bundled_libs:
        symlink_path = get_symlink_path(lib_hard_path)
        if _os.path.exists(symlink_path):
            continue
        try:
            _os.symlink(lib_hard_path, symlink_path)
        except PermissionError:
            print("Tried creating symlink {}. If you need to link to "
                  "bundled shared libraries, run "
                  "pyarrow.create_library_symlinks() as root"
                  .format(symlink_path))


def get_library_dirs():
    """
    Return a list of directories likely to contain Arrow C++ libraries for
    linking C or Cython extensions using pyarrow
    """
    package_cwd = _os.path.dirname(__file__)
    library_dirs = [package_cwd]

    def append_library_dir(library_dir):
        if library_dir not in library_dirs:
            library_dirs.append(library_dir)

    # Search library paths via pkg-config. This is necessary if the user
    # installed libarrow and the other shared libraries manually and they
    # are not shipped inside the pyarrow package (see also ARROW-2976).
    pkg_config_executable = _os.environ.get('PKG_CONFIG') or 'pkg-config'
    for pkgname in ["arrow", "arrow_python"]:
        if _has_pkg_config(pkgname):
            library_dir = _read_pkg_config_variable(pkgname,
                                                    ["--libs-only-L"])
            # pkg-config output could be empty if Arrow is installed
            # as a system package.
            if library_dir:
                if not library_dir.startswith("-L"):
                    raise ValueError(
                        "pkg-config --libs-only-L returned unexpected "
                        "value {!r}".format(library_dir))
                append_library_dir(library_dir[2:])

    if _sys.platform == 'win32':
        # TODO(wesm): Is this necessary, or does setuptools within a conda
        # installation add Library\lib to the linker path for MSVC?
        python_base_install = _os.path.dirname(_sys.executable)
        library_dir = _os.path.join(python_base_install, 'Library', 'lib')

        if _os.path.exists(_os.path.join(library_dir, 'arrow.lib')):
            append_library_dir(library_dir)

    # ARROW-4074: Allow for ARROW_HOME to be set to some other directory
    if _os.environ.get('ARROW_HOME'):
        append_library_dir(_os.path.join(_os.environ['ARROW_HOME'], 'lib'))
    else:
        # Python wheels bundle the Arrow libraries in the pyarrow directory.
        append_library_dir(_os.path.dirname(_os.path.abspath(__file__)))

    return library_dirs
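
# Illustrative build sketch (not part of the original module): get_include(),
# get_libraries() and get_library_dirs() above are meant to be passed to
# setuptools when compiling a C++/Cython extension against pyarrow. The module
# name "myext" and the source file "myext.pyx" are hypothetical placeholders.
#
#   # setup.py (sketch)
#   from setuptools import setup, Extension
#   from Cython.Build import cythonize
#   import pyarrow as pa
#
#   ext = Extension(
#       "myext",
#       sources=["myext.pyx"],
#       language="c++",
#       include_dirs=[pa.get_include()],
#       libraries=pa.get_libraries(),
#       library_dirs=pa.get_library_dirs(),
#   )
#   setup(ext_modules=cythonize([ext]))
#
# For wheel-based installs, create_library_symlinks() may additionally be
# needed so that "-larrow" resolves against the bundled, ABI-versioned
# shared libraries.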