[ceph.git] / ceph / src / arrow / python / pyarrow / tests / test_schema.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from collections import OrderedDict
import pickle
import sys
import weakref

import pytest
import numpy as np
import pyarrow as pa

import pyarrow.tests.util as test_util
from pyarrow.vendored.version import Version


def test_schema_constructor_errors():
    """Calling ``pa.Schema()`` directly raises TypeError with a hint."""
    expected_message = (
        "Do not call Schema's constructor directly, use `pyarrow.schema` "
        "instead"
    )
    with pytest.raises(TypeError, match=expected_message):
        pa.Schema()


def test_type_integers():
    """str() of every integer type equals the name of its factory."""
    int_type_names = (
        'int8', 'int16', 'int32', 'int64',
        'uint8', 'uint16', 'uint32', 'uint64',
    )

    for type_name in int_type_names:
        # The factory function on ``pa`` has the same name as the type.
        arrow_type = getattr(pa, type_name)()
        assert str(arrow_type) == type_name


def test_type_to_pandas_dtype():
    """Compare ``to_pandas_dtype()`` of Arrow types with NumPy dtypes."""
    datetime_ns = np.dtype('datetime64[ns]')

    expectations = [
        (pa.null(), np.object_),
        (pa.bool_(), np.bool_),
    ]

    # Integer and float types map onto the NumPy scalar type of the same name.
    same_name_types = [
        'int8', 'int16', 'int32', 'int64',
        'uint8', 'uint16', 'uint32', 'uint64',
        'float16', 'float32', 'float64',
    ]
    expectations += [(getattr(pa, type_name)(), getattr(np, type_name))
                     for type_name in same_name_types]

    # Date and timestamp types are all expected as datetime64[ns] here.
    temporal_types = [pa.date32(), pa.date64(), pa.timestamp('ms')]
    expectations += [(arrow_type, datetime_ns)
                     for arrow_type in temporal_types]

    # Binary, string and nested types are expected as object dtype.
    object_types = [
        pa.binary(),
        pa.binary(12),
        pa.string(),
        pa.list_(pa.int8()),
        # pa.list_(pa.int8(), 2),  # TODO needs pandas conversion
        pa.map_(pa.int64(), pa.float64()),
    ]
    expectations += [(arrow_type, np.object_) for arrow_type in object_types]

    for arrow_type, numpy_type in expectations:
        assert arrow_type.to_pandas_dtype() == numpy_type


@pytest.mark.pandas
def test_type_to_pandas_dtype_check_import():
    """Run the helper script written for ARROW-7980."""
    # ARROW-7980
    script_name = 'arrow_7980.py'
    test_util.invoke_script(script_name)


def test_type_list():
    """str() of a list type shows the item field's name and type."""
    # Built from a bare value type: the item field is shown as "item".
    list_of_int32 = pa.list_(pa.int32())
    assert str(list_of_int32) == 'list<item: int32>'

    # Built from an explicit field: the field's own name is shown.
    named_item = pa.field('my_item', pa.string())
    list_of_named = pa.list_(named_item)
    assert str(list_of_named) == 'list<my_item: string>'


def test_type_comparisons():
    """A DataType equals the same type and its alias, but not an int."""
    int32_type = pa.int32()

    assert int32_type == pa.int32()
    # Comparison against the string alias of the type.
    assert int32_type == 'int32'
    # Comparison against an unrelated Python object.
    assert int32_type != 5


def test_type_for_alias():
    """``pa.type_for_alias`` resolves each string alias to its type."""
    expectations = []

    # Numeric types are reachable via a short code and via their full name.
    numeric_aliases = [
        ('i1', 'int8'), ('i2', 'int16'), ('i4', 'int32'), ('i8', 'int64'),
        ('u1', 'uint8'), ('u2', 'uint16'), ('u4', 'uint32'), ('u8', 'uint64'),
        ('f4', 'float32'), ('f8', 'float64'),
    ]
    for short_code, full_name in numeric_aliases:
        arrow_type = getattr(pa, full_name)()
        expectations.append((short_code, arrow_type))
        expectations.append((full_name, arrow_type))

    expectations += [
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
    ]

    # Parametrized temporal types use the "name[unit]" spelling.
    expectations += [('time32[{}]'.format(unit), pa.time32(unit))
                     for unit in ('s', 'ms')]
    expectations += [('time64[{}]'.format(unit), pa.time64(unit))
                     for unit in ('us', 'ns')]
    all_units = ('s', 'ms', 'us', 'ns')
    expectations += [('timestamp[{}]'.format(unit), pa.timestamp(unit))
                     for unit in all_units]
    expectations += [('duration[{}]'.format(unit), pa.duration(unit))
                     for unit in all_units]

    expectations.append(
        ('month_day_nano_interval', pa.month_day_nano_interval()))

    for alias, arrow_type in expectations:
        assert pa.type_for_alias(alias) == arrow_type


def test_type_string():
    """str() of the string type is 'string'."""
    assert str(pa.string()) == 'string'


def test_type_timestamp_with_tz():
    """A timestamp type exposes the unit and time zone it was built with."""
    zone_name = 'America/Los_Angeles'
    ts_type = pa.timestamp('ns', tz=zone_name)

    assert ts_type.unit == 'ns'
    assert ts_type.tz == zone_name


def test_time_types():
    """Check units, str() and rejected units of time32 / time64 types."""
    seconds = pa.time32('s')
    millis = pa.time32('ms')
    micros = pa.time64('us')
    nanos = pa.time64('ns')

    unit_expectations = [
        (seconds, 's'),
        (millis, 'ms'),
        (micros, 'us'),
        (nanos, 'ns'),
    ]
    for time_type, unit in unit_expectations:
        assert time_type.unit == unit

    assert str(seconds) == 'time32[s]'
    assert str(nanos) == 'time64[ns]'

    # Each factory rejects a unit that belongs to the other width.
    for factory, rejected_unit in [(pa.time32, 'us'), (pa.time64, 's')]:
        with pytest.raises(ValueError):
            factory(rejected_unit)


def test_from_numpy_dtype():
    """Check ``pa.from_numpy_dtype`` for NumPy dtypes and dtype-like inputs.

    Covers boolean, all signed and unsigned integer widths, floats, unicode
    and bytes strings, and datetime64 / timedelta64 in the four supported
    units. Also checks that objects convertible to a dtype are accepted,
    that the object dtype raises NotImplementedError, and that a string
    that is not a dtype raises TypeError.
    """
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        # uint64 was the only integer width missing from this table
        (np.dtype('uint64'), pa.uint64()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns')),
        (np.dtype('timedelta64[s]'), pa.duration('s')),
        (np.dtype('timedelta64[ms]'), pa.duration('ms')),
        (np.dtype('timedelta64[us]'), pa.duration('us')),
        (np.dtype('timedelta64[ns]'), pa.duration('ns')),
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    assert pa.from_numpy_dtype(np.str_) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    # The generic object dtype has no Arrow counterpart
    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    # A string that NumPy cannot interpret as a dtype
    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')


def test_schema():
    """Exercise names, types, length, lookup and repr of a basic schema."""
    foo_field = pa.field('foo', pa.int32())
    input_fields = [
        foo_field,
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
    ]
    sch = pa.schema(input_fields)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    assert len(sch) == 3

    # The first field can be fetched by position ...
    by_position = sch[0]
    assert by_position.name == 'foo'
    assert by_position.type == foo_field.type

    # ... and by name.
    by_name = sch.field('foo')
    assert by_name.name == 'foo'
    assert by_name.type == foo_field.type

    expected_repr = """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""
    assert repr(sch) == expected_repr

    # None is not an acceptable field.
    with pytest.raises(TypeError):
        pa.schema([None])


def test_schema_weakref():
    """A Schema can be weakly referenced; the ref clears after ``del``."""
    member_fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
    ]
    target = pa.schema(member_fields)
    weak = weakref.ref(target)

    # While a strong reference exists the weak reference resolves.
    assert weak() is not None

    # Dropping the only strong reference is expected to clear it.
    del target
    assert weak() is None


def test_schema_to_string_with_metadata():
    """Check ``Schema.to_string()`` output for field and schema metadata.

    Builds a schema with per-field metadata and one long schema-level
    metadata value, then compares ``to_string()`` against exact expected
    text for the default call and for each combination of the
    ``truncate_metadata``, ``show_field_metadata`` and
    ``show_schema_metadata`` keyword arguments used below.
    """
    # Long multi-line value used as schema-level metadata.
    lorem = """\
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla accumsan vel
turpis et mollis. Aliquam tincidunt arcu id tortor blandit blandit. Donec
eget leo quis lectus scelerisque varius. Class aptent taciti sociosqu ad
litora torquent per conubia nostra, per inceptos himenaeos. Praesent
faucibus, diam eu volutpat iaculis, tellus est porta ligula, a efficitur
turpis nulla facilisis quam. Aliquam vitae lorem erat. Proin a dolor ac libero
dignissim mollis vitae eu mauris. Quisque posuere tellus vitae massa
pellentesque sagittis. Aenean feugiat, diam ac dignissim fermentum, lorem
sapien commodo massa, vel volutpat orci nisi eu justo. Nulla non blandit
sapien. Quisque pretium vestibulum urna eu vehicula."""
    # ARROW-7063
    my_schema = pa.schema([pa.field("foo", "int32", False,
                                    metadata={"key1": "value1"}),
                           pa.field("bar", "string", True,
                                    metadata={"key3": "value3"})],
                          metadata={"lorem": lorem})

    # Default call: the long value is expected as its first 65 characters,
    # followed by "' + " and the number of characters left out.
    assert my_schema.to_string() == """\
foo: int32 not null
  -- field metadata --
  key1: 'value1'
bar: string
  -- field metadata --
  key3: 'value3'
-- schema metadata --
lorem: '""" + lorem[:65] + "' + " + str(len(lorem) - 65)

    # Metadata that exactly fits
    result = pa.schema([('f0', 'int32')],
                       metadata={'key': 'value' + 'x' * 62}).to_string()
    assert result == """\
f0: int32
-- schema metadata --
key: 'valuexxxxxxxxxxxxxxxxxxxxxxxxxxxxx\
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'"""

    # truncate_metadata=False: the full value is expected in the output.
    assert my_schema.to_string(truncate_metadata=False) == """\
foo: int32 not null
  -- field metadata --
  key1: 'value1'
bar: string
  -- field metadata --
  key3: 'value3'
-- schema metadata --
lorem: '{}'""".format(lorem)

    # show_field_metadata=False: only the schema-level metadata is expected.
    assert my_schema.to_string(truncate_metadata=False,
                               show_field_metadata=False) == """\
foo: int32 not null
bar: string
-- schema metadata --
lorem: '{}'""".format(lorem)

    # show_schema_metadata=False: only the field-level metadata is expected.
    assert my_schema.to_string(truncate_metadata=False,
                               show_schema_metadata=False) == """\
foo: int32 not null
  -- field metadata --
  key1: 'value1'
bar: string
  -- field metadata --
  key3: 'value3'"""

    # Both switched off: only the field declarations are expected.
    assert my_schema.to_string(truncate_metadata=False,
                               show_field_metadata=False,
                               show_schema_metadata=False) == """\
foo: int32 not null
bar: string"""


def test_schema_from_tuples():
    """``pa.schema`` accepts a list of ``(name, type)`` tuples."""
    name_type_pairs = [
        ('foo', pa.int32()),
        ('bar', pa.string()),
        ('baz', pa.list_(pa.int8())),
    ]
    sch = pa.schema(name_type_pairs)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3

    expected_repr = """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""
    assert repr(sch) == expected_repr

    # A tuple whose type is None is rejected.
    invalid_pairs = [('foo', None)]
    with pytest.raises(TypeError):
        pa.schema(invalid_pairs)


def test_schema_from_mapping():
    """``pa.schema`` accepts an ordered mapping of name to type."""
    name_to_type = OrderedDict([
        ('foo', pa.int32()),
        ('bar', pa.string()),
        ('baz', pa.list_(pa.int8())),
    ])
    sch = pa.schema(name_to_type)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3

    expected_repr = """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""
    assert repr(sch) == expected_repr

    # A mapping whose value is None is rejected.
    invalid_mapping = OrderedDict([('foo', None)])
    with pytest.raises(TypeError):
        pa.schema(invalid_mapping)


def test_schema_duplicate_fields():
    """Check lookups on a schema in which the name 'foo' occurs twice."""
    first_foo = pa.field('foo', pa.int32())
    bar = pa.field('bar', pa.string())
    second_foo = pa.field('foo', pa.list_(pa.int8()))
    sch = pa.schema([first_foo, bar, second_foo])

    assert sch.names == ['foo', 'bar', 'foo']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3

    expected_repr = """\
foo: int32
bar: string
foo: list<item: int8>
  child 0, item: int8"""
    assert repr(sch) == expected_repr

    # Positional access is unaffected by the repeated name.
    leading = sch[0]
    assert leading.name == 'foo'
    assert leading.type == first_foo.type

    # field_by_name() is expected to warn on every call made here.
    with pytest.warns(FutureWarning):
        assert sch.field_by_name('bar') == bar
    with pytest.warns(FutureWarning):
        assert sch.field_by_name('xxx') is None
    with pytest.warns((UserWarning, FutureWarning)):
        assert sch.field_by_name('foo') is None

    # Schema::GetFieldIndex
    foo_index = sch.get_field_index('foo')
    assert foo_index == -1

    # Schema::GetAllFieldIndices
    foo_indices = sch.get_all_field_indices('foo')
    assert foo_indices == [0, 2]


def test_field_flatten():
    """Check ``Field.flatten()`` for plain, struct and nested struct fields."""
    foo_meta = {b'foo': b'bar'}

    # A non-struct field flattens to a list holding just itself.
    leaf = pa.field('foo', pa.int32()).with_metadata(foo_meta)
    assert leaf.flatten() == [leaf]

    required_leaf = pa.field('bar', pa.float64(), nullable=False)
    struct_type = pa.struct([leaf, required_leaf])

    # Non-nullable struct parent: children get the parent name as prefix.
    required_parent = pa.field('ff', struct_type, nullable=False)
    expected_children = [
        pa.field('ff.foo', pa.int32()).with_metadata({b'foo': b'bar'}),
        pa.field('ff.bar', pa.float64(), nullable=False),  # XXX
    ]
    assert required_parent.flatten() == expected_children

    # Nullable parent makes flattened child nullable
    nullable_parent = pa.field('ff', pa.struct([leaf, required_leaf]))
    expected_children = [
        pa.field('ff.foo', pa.int32()).with_metadata({b'foo': b'bar'}),
        pa.field('ff.bar', pa.float64()),
    ]
    assert nullable_parent.flatten() == expected_children

    # Only one level is flattened: the inner struct stays a struct.
    outer = pa.field('fff', pa.struct([nullable_parent]))
    expected_inner = pa.field('fff.ff', pa.struct([leaf, required_leaf]))
    assert outer.flatten() == [expected_inner]


def test_schema_add_remove_metadata():
    """Round-trip metadata through with_metadata() / remove_metadata()."""
    bare = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
    ])
    assert bare.metadata is None

    payload = {b'foo': b'bar', b'pandas': b'badger'}
    annotated = bare.with_metadata(payload)
    assert annotated.metadata == payload

    stripped = annotated.remove_metadata()
    assert stripped.metadata is None

    # idempotent
    stripped_again = stripped.remove_metadata()
    assert stripped_again.metadata is None


def test_schema_equals():
    """Check ``Schema.equals`` with and without ``check_metadata``.

    Schemas built from the same fields compare equal; schema-level metadata
    is ignored by default and only taken into account when
    ``check_metadata=True`` is passed. A schema with fewer fields is never
    equal.
    """
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    metadata = {b'foo': b'bar', b'pandas': b'badger'}

    sch1 = pa.schema(fields)
    sch2 = pa.schema(fields)
    sch3 = pa.schema(fields, metadata=metadata)
    sch4 = pa.schema(fields, metadata=metadata)

    assert sch1.equals(sch2, check_metadata=True)
    assert sch3.equals(sch4, check_metadata=True)
    # Metadata is ignored unless explicitly requested
    assert sch1.equals(sch3)
    assert not sch1.equals(sch3, check_metadata=True)
    # The comparison must give the same answer in the other direction
    # (previously the line above was simply repeated verbatim)
    assert not sch3.equals(sch1, check_metadata=True)

    # Dropping a field makes the schemas differ regardless of metadata
    del fields[-1]
    sch3 = pa.schema(fields)
    assert not sch1.equals(sch3)


def test_schema_equals_propagates_check_metadata():
    """``check_metadata=True`` also compares field-level metadata."""
    # ARROW-4088
    foo = pa.field('foo', pa.int32())
    without_field_meta = pa.schema([foo, pa.field('bar', pa.string())])
    with_field_meta = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string(), metadata={'a': 'alpha'}),
    ])

    assert not without_field_meta.equals(with_field_meta, check_metadata=True)
    assert without_field_meta.equals(with_field_meta)


def test_schema_equals_invalid_type():
    """``Schema.equals`` raises TypeError for non-Schema arguments."""
    # ARROW-5873
    schema = pa.schema([pa.field("a", pa.int64())])
    non_schemas = (None, 'string', pa.array([1, 2]))

    for candidate in non_schemas:
        with pytest.raises(TypeError):
            schema.equals(candidate)


def test_schema_equality_operators():
    """Check ``==`` / ``!=`` between schemas and against other objects."""
    shared_fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
    ]
    extra = {b'foo': b'bar', b'pandas': b'badger'}

    plain_a = pa.schema(shared_fields)
    plain_b = pa.schema(shared_fields)
    annotated_a = pa.schema(shared_fields, metadata=extra)
    annotated_b = pa.schema(shared_fields, metadata=extra)

    assert plain_a == plain_b
    assert annotated_a == annotated_b

    # __eq__ and __ne__ do not check metadata
    assert plain_a == annotated_a
    assert not plain_a != annotated_a

    assert plain_b == annotated_b

    # comparison with other types doesn't raise
    assert plain_a != []
    assert annotated_a != 'foo'


def test_schema_get_fields():
    """``Schema.field`` accepts names and integer positions."""
    schema = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
    ])

    # Valid keys: a name, a position, and a negative position.
    valid_lookups = [('foo', 'foo'), (0, 'foo'), (-1, 'baz')]
    for key, expected_name in valid_lookups:
        assert schema.field(key).name == expected_name

    # Invalid keys and the exception each one is expected to raise.
    invalid_lookups = [
        ('other', KeyError),
        (0.0, TypeError),
        (4, IndexError),
    ]
    for key, expected_error in invalid_lookups:
        with pytest.raises(expected_error):
            schema.field(key)


def test_schema_negative_indexing():
    """Negative indices count from the end; out-of-range ones raise."""
    schema = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
    ])

    # Pairs of (negative index, equivalent non-negative index).
    equivalent_indices = [(-1, 2), (-2, 1), (-3, 0)]
    for negative, positive in equivalent_indices:
        assert schema[negative].equals(schema[positive])

    # One step past either end of the three-field schema.
    for out_of_range in (-4, 3):
        with pytest.raises(IndexError):
            schema[out_of_range]


def test_schema_repr_with_dictionaries():
    """repr() of a schema spells out a dictionary-typed field."""
    dictionary_field = pa.field('one', pa.dictionary(pa.int16(), pa.string()))
    plain_field = pa.field('two', pa.int32())
    sch = pa.schema([dictionary_field, plain_field])

    expected_repr = """\
one: dictionary<values=string, indices=int16, ordered=0>
two: int32"""

    assert repr(sch) == expected_repr


def test_type_schema_pickling():
    """Types, fields and a schema built from them survive a pickle trip."""
    union_members = [
        pa.field('a', pa.int8()),
        pa.field('b', pa.int16()),
    ]
    picklables = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.map_(pa.string(), pa.int8()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string'),
        ]),
        pa.union(list(union_members), pa.lib.UnionMode_SPARSE),
        pa.union(list(union_members), pa.lib.UnionMode_DENSE),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal128(12, 2),
        pa.decimal256(76, 38),
        pa.field('a', 'string', metadata={b'foo': b'bar'}),
        pa.list_(pa.field("element", pa.int64())),
        pa.large_list(pa.field("element", pa.int64())),
        pa.map_(pa.field("key", pa.string(), nullable=False),
                pa.field("value", pa.int8())),
    ]

    # Every entry on its own must compare equal after dumps/loads.
    for original in picklables:
        restored = pickle.loads(pickle.dumps(original))
        assert original == restored

    # Entries that are already fields are used as they are; bare types are
    # wrapped in a field named after their position in the list.
    schema_fields = [
        item if isinstance(item, pa.Field)
        else pa.field('_f{}'.format(position), item)
        for position, item in enumerate(picklables)
    ]

    schema = pa.schema(schema_fields, metadata={b'foo': b'bar'})
    restored_schema = pickle.loads(pickle.dumps(schema))
    assert schema == restored_schema


def test_empty_table():
    """``Schema.empty_table()`` gives a zero-row Table with that schema."""
    nested_schema = pa.schema([
        pa.field('f0', pa.int64()),
        pa.field('f1', pa.dictionary(pa.int32(), pa.string())),
        pa.field('f2', pa.list_(pa.list_(pa.int64()))),
    ])
    # test it preserves field nullability
    nullability_schema = pa.schema([
        pa.field('a', pa.int64(), nullable=False),
        pa.field('b', pa.int64()),
    ])

    for source_schema in (nested_schema, nullability_schema):
        empty = source_schema.empty_table()
        assert isinstance(empty, pa.Table)
        assert empty.num_rows == 0
        assert empty.schema == source_schema


@pytest.mark.pandas
def test_schema_from_pandas():
    """``Schema.from_pandas`` matches the schema from ``Table.from_pandas``."""
    import pandas as pd

    timestamps = np.array([
        '2007-07-13T01:23:34.123456789',
        '2006-01-13T12:34:56.432539784',
        '2010-08-13T05:46:57.437699912'
    ], dtype='datetime64[ns]')
    columns = [
        list(range(10)),
        pd.Categorical(list(range(10))),
        ['foo', 'bar', None, 'baz', 'qux'],
        timestamps,
    ]
    # The Int32Dtype column is only added for pandas 1.0.0 or newer.
    if Version(pd.__version__) >= Version('1.0.0'):
        columns.append(pd.array([1, 2, None], dtype=pd.Int32Dtype()))

    for column in columns:
        frame = pd.DataFrame({'a': column})
        inferred = pa.Schema.from_pandas(frame)
        via_table = pa.Table.from_pandas(frame).schema
        assert inferred == via_table


def test_schema_sizeof():
    """``sys.getsizeof`` of a schema grows as its metadata grows."""
    bare = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
    ])
    bare_size = sys.getsizeof(bare)
    assert bare_size > 30

    short_meta = bare.with_metadata({"key": "some metadata"})
    short_meta_size = sys.getsizeof(short_meta)
    assert short_meta_size > sys.getsizeof(bare)

    long_meta = bare.with_metadata({"key": "some more metadata"})
    assert sys.getsizeof(long_meta) > short_meta_size


def test_schema_merge():
    """``pa.unify_schemas`` merges compatible schemas, rejects conflicts."""
    dictionary_type = pa.dictionary(pa.int32(), pa.string())

    base = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
    ])
    overlapping = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('qux', pa.bool_()),
    ])
    disjoint = pa.schema([
        pa.field('quux', dictionary_type),
    ])
    # Same field names as ``overlapping`` but 'foo' has a different type.
    conflicting = pa.schema([
        pa.field('foo', pa.int64()),
        pa.field('qux', pa.bool_()),
    ])

    merged = pa.unify_schemas([base, overlapping, disjoint])
    expected = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
        pa.field('qux', pa.bool_()),
        pa.field('quux', pa.dictionary(pa.int32(), pa.string())),
    ])
    assert merged.equals(expected)

    with pytest.raises(pa.ArrowInvalid):
        pa.unify_schemas([overlapping, conflicting])

    # ARROW-14002: Try with tuple instead of list
    merged = pa.unify_schemas((base, overlapping, disjoint))
    assert merged.equals(expected)


def test_undecodable_metadata():
    """str() of a schema works when metadata values are not valid text."""
    # ARROW-10214: undecodable metadata shouldn't fail repr()
    field_value = b'abcdef\xff\x00'
    schema_value = b'ghijkl\xff\x00'
    ints = pa.field('ints', pa.int16(), metadata={'key': field_value})
    schema = pa.schema([ints], metadata={'key': schema_value})

    # The readable prefix of each value must still show up in the text.
    assert 'abcdef' in str(schema)
    assert 'ghijkl' in str(schema)
Commit	Line	Data
1d09f67e TL	1	# Licensed to the Apache Software Foundation (ASF) under one
	2	# or more contributor license agreements. See the NOTICE file
	3	# distributed with this work for additional information
	4	# regarding copyright ownership. The ASF licenses this file
	5	# to you under the Apache License, Version 2.0 (the
	6	# "License"); you may not use this file except in compliance
	7	# with the License. You may obtain a copy of the License at
	8	#
	9	# http://www.apache.org/licenses/LICENSE-2.0
	10	#
	11	# Unless required by applicable law or agreed to in writing,
	12	# software distributed under the License is distributed on an
	13	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	14	# KIND, either express or implied. See the License for the
	15	# specific language governing permissions and limitations
	16	# under the License.
	17
	18	from collections import OrderedDict
	19	import pickle
	20	import sys
	21	import weakref
	22
	23	import pytest
	24	import numpy as np
	25	import pyarrow as pa
	26
	27	import pyarrow.tests.util as test_util
	28	from pyarrow.vendored.version import Version
	29
	30
	31	def test_schema_constructor_errors():
	32	msg = ("Do not call Schema's constructor directly, use `pyarrow.schema` "
	33	"instead")
	34	with pytest.raises(TypeError, match=msg):
	35	pa.Schema()
	36
	37
	38	def test_type_integers():
	39	dtypes = ['int8', 'int16', 'int32', 'int64',
	40	'uint8', 'uint16', 'uint32', 'uint64']
	41
	42	for name in dtypes:
	43	factory = getattr(pa, name)
	44	t = factory()
	45	assert str(t) == name
	46
	47
	48	def test_type_to_pandas_dtype():
	49	M8_ns = np.dtype('datetime64[ns]')
	50	cases = [
	51	(pa.null(), np.object_),
	52	(pa.bool_(), np.bool_),
	53	(pa.int8(), np.int8),
	54	(pa.int16(), np.int16),
	55	(pa.int32(), np.int32),
	56	(pa.int64(), np.int64),
	57	(pa.uint8(), np.uint8),
	58	(pa.uint16(), np.uint16),
	59	(pa.uint32(), np.uint32),
	60	(pa.uint64(), np.uint64),
	61	(pa.float16(), np.float16),
	62	(pa.float32(), np.float32),
	63	(pa.float64(), np.float64),
	64	(pa.date32(), M8_ns),
65	(pa.date64(), M8_ns),
66	(pa.timestamp('ms'), M8_ns),
67	(pa.binary(), np.object_),
68	(pa.binary(12), np.object_),
69	(pa.string(), np.object_),
70	(pa.list_(pa.int8()), np.object_),
71	# (pa.list_(pa.int8(), 2), np.object_), # TODO needs pandas conversion
72	(pa.map_(pa.int64(), pa.float64()), np.object_),
73	]
74	for arrow_type, numpy_type in cases:
75	assert arrow_type.to_pandas_dtype() == numpy_type
76
77
78	@pytest.mark.pandas
79	def test_type_to_pandas_dtype_check_import():
80	# ARROW-7980
81	test_util.invoke_script('arrow_7980.py')
82
83
84	def test_type_list():
85	value_type = pa.int32()
86	list_type = pa.list_(value_type)
87	assert str(list_type) == 'list<item: int32>'
88
89	field = pa.field('my_item', pa.string())
90	l2 = pa.list_(field)
91	assert str(l2) == 'list<my_item: string>'
92
93
94	def test_type_comparisons():
95	val = pa.int32()
96	assert val == pa.int32()
97	assert val == 'int32'
98	assert val != 5
99
100
101	def test_type_for_alias():
102	cases = [
103	('i1', pa.int8()),
104	('int8', pa.int8()),
105	('i2', pa.int16()),
106	('int16', pa.int16()),
107	('i4', pa.int32()),
108	('int32', pa.int32()),
109	('i8', pa.int64()),
110	('int64', pa.int64()),
111	('u1', pa.uint8()),
112	('uint8', pa.uint8()),
113	('u2', pa.uint16()),
114	('uint16', pa.uint16()),
115	('u4', pa.uint32()),
116	('uint32', pa.uint32()),
117	('u8', pa.uint64()),
118	('uint64', pa.uint64()),
119	('f4', pa.float32()),
120	('float32', pa.float32()),
121	('f8', pa.float64()),
122	('float64', pa.float64()),
123	('date32', pa.date32()),
124	('date64', pa.date64()),
125	('string', pa.string()),
126	('str', pa.string()),
127	('binary', pa.binary()),
128	('time32[s]', pa.time32('s')),
129	('time32[ms]', pa.time32('ms')),
130	('time64[us]', pa.time64('us')),
131	('time64[ns]', pa.time64('ns')),
132	('timestamp[s]', pa.timestamp('s')),
133	('timestamp[ms]', pa.timestamp('ms')),
134	('timestamp[us]', pa.timestamp('us')),
135	('timestamp[ns]', pa.timestamp('ns')),
136	('duration[s]', pa.duration('s')),
137	('duration[ms]', pa.duration('ms')),
138	('duration[us]', pa.duration('us')),
139	('duration[ns]', pa.duration('ns')),
140	('month_day_nano_interval', pa.month_day_nano_interval()),
141	]
142
143	for val, expected in cases:
144	assert pa.type_for_alias(val) == expected
145
146
147	def test_type_string():
148	t = pa.string()
149	assert str(t) == 'string'
150
151
152	def test_type_timestamp_with_tz():
153	tz = 'America/Los_Angeles'
154	t = pa.timestamp('ns', tz=tz)
155	assert t.unit == 'ns'
156	assert t.tz == tz
157
158
159	def test_time_types():
160	t1 = pa.time32('s')
161	t2 = pa.time32('ms')
162	t3 = pa.time64('us')
163	t4 = pa.time64('ns')
164
165	assert t1.unit == 's'
166	assert t2.unit == 'ms'
167	assert t3.unit == 'us'
168	assert t4.unit == 'ns'
169
170	assert str(t1) == 'time32[s]'
171	assert str(t4) == 'time64[ns]'
172
173	with pytest.raises(ValueError):
174	pa.time32('us')
175
176	with pytest.raises(ValueError):
177	pa.time64('s')
178
179
180	def test_from_numpy_dtype():
181	cases = [
182	(np.dtype('bool'), pa.bool_()),
183	(np.dtype('int8'), pa.int8()),
184	(np.dtype('int16'), pa.int16()),
185	(np.dtype('int32'), pa.int32()),
186	(np.dtype('int64'), pa.int64()),
187	(np.dtype('uint8'), pa.uint8()),
188	(np.dtype('uint16'), pa.uint16()),
189	(np.dtype('uint32'), pa.uint32()),
190	(np.dtype('float16'), pa.float16()),
191	(np.dtype('float32'), pa.float32()),
192	(np.dtype('float64'), pa.float64()),
193	(np.dtype('U'), pa.string()),
194	(np.dtype('S'), pa.binary()),
195	(np.dtype('datetime64[s]'), pa.timestamp('s')),
196	(np.dtype('datetime64[ms]'), pa.timestamp('ms')),
197	(np.dtype('datetime64[us]'), pa.timestamp('us')),
198	(np.dtype('datetime64[ns]'), pa.timestamp('ns')),
199	(np.dtype('timedelta64[s]'), pa.duration('s')),
200	(np.dtype('timedelta64[ms]'), pa.duration('ms')),
201	(np.dtype('timedelta64[us]'), pa.duration('us')),
202	(np.dtype('timedelta64[ns]'), pa.duration('ns')),
203	]
204
205	for dt, pt in cases:
206	result = pa.from_numpy_dtype(dt)
207	assert result == pt
208
209	# Things convertible to numpy dtypes work
210	assert pa.from_numpy_dtype('U') == pa.string()
211	assert pa.from_numpy_dtype(np.str_) == pa.string()
212	assert pa.from_numpy_dtype('int32') == pa.int32()
213	assert pa.from_numpy_dtype(bool) == pa.bool_()
214
215	with pytest.raises(NotImplementedError):
216	pa.from_numpy_dtype(np.dtype('O'))
217
218	with pytest.raises(TypeError):
219	pa.from_numpy_dtype('not_convertible_to_dtype')
220
221
222	def test_schema():
223	fields = [
224	pa.field('foo', pa.int32()),
225	pa.field('bar', pa.string()),
226	pa.field('baz', pa.list_(pa.int8()))
227	]
228	sch = pa.schema(fields)
229
230	assert sch.names == ['foo', 'bar', 'baz']
231	assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
232
233	assert len(sch) == 3
234	assert sch[0].name == 'foo'
235	assert sch[0].type == fields[0].type
236	assert sch.field('foo').name == 'foo'
237	assert sch.field('foo').type == fields[0].type
238
239	assert repr(sch) == """\
240	foo: int32
241	bar: string
242	baz: list<item: int8>
243	child 0, item: int8"""
244
245	with pytest.raises(TypeError):
246	pa.schema([None])
247
248
249	def test_schema_weakref():
250	fields = [
251	pa.field('foo', pa.int32()),
252	pa.field('bar', pa.string()),
253	pa.field('baz', pa.list_(pa.int8()))
254	]
255	schema = pa.schema(fields)
256	wr = weakref.ref(schema)
257	assert wr() is not None
258	del schema
259	assert wr() is None
260
261
262	def test_schema_to_string_with_metadata():
263	lorem = """\
264	Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla accumsan vel
265	turpis et mollis. Aliquam tincidunt arcu id tortor blandit blandit. Donec
266	eget leo quis lectus scelerisque varius. Class aptent taciti sociosqu ad
267	litora torquent per conubia nostra, per inceptos himenaeos. Praesent
268	faucibus, diam eu volutpat iaculis, tellus est porta ligula, a efficitur
269	turpis nulla facilisis quam. Aliquam vitae lorem erat. Proin a dolor ac libero
270	dignissim mollis vitae eu mauris. Quisque posuere tellus vitae massa
271	pellentesque sagittis. Aenean feugiat, diam ac dignissim fermentum, lorem
272	sapien commodo massa, vel volutpat orci nisi eu justo. Nulla non blandit
273	sapien. Quisque pretium vestibulum urna eu vehicula."""
274	# ARROW-7063
275	my_schema = pa.schema([pa.field("foo", "int32", False,
276	metadata={"key1": "value1"}),
277	pa.field("bar", "string", True,
278	metadata={"key3": "value3"})],
279	metadata={"lorem": lorem})
280
281	assert my_schema.to_string() == """\
282	foo: int32 not null
283	-- field metadata --
284	key1: 'value1'
285	bar: string
286	-- field metadata --
287	key3: 'value3'
288	-- schema metadata --
289	lorem: '""" + lorem[:65] + "' + " + str(len(lorem) - 65)
290
291	# Metadata that exactly fits
292	result = pa.schema([('f0', 'int32')],
293	metadata={'key': 'value' + 'x' * 62}).to_string()
294	assert result == """\
295	f0: int32
296	-- schema metadata --
297	key: 'valuexxxxxxxxxxxxxxxxxxxxxxxxxxxxx\
298	xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'"""
299
300	assert my_schema.to_string(truncate_metadata=False) == """\
301	foo: int32 not null
302	-- field metadata --
303	key1: 'value1'
304	bar: string
305	-- field metadata --
306	key3: 'value3'
307	-- schema metadata --
308	lorem: '{}'""".format(lorem)
309
310	assert my_schema.to_string(truncate_metadata=False,
311	show_field_metadata=False) == """\
312	foo: int32 not null
313	bar: string
314	-- schema metadata --
315	lorem: '{}'""".format(lorem)
316
317	assert my_schema.to_string(truncate_metadata=False,
318	show_schema_metadata=False) == """\
319	foo: int32 not null
320	-- field metadata --
321	key1: 'value1'
322	bar: string
323	-- field metadata --
324	key3: 'value3'"""
325
326	assert my_schema.to_string(truncate_metadata=False,
327	show_field_metadata=False,
328	show_schema_metadata=False) == """\
329	foo: int32 not null
330	bar: string"""
331
332
333	def test_schema_from_tuples():
334	fields = [
335	('foo', pa.int32()),
336	('bar', pa.string()),
337	('baz', pa.list_(pa.int8())),
338	]
339	sch = pa.schema(fields)
340	assert sch.names == ['foo', 'bar', 'baz']
341	assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
342	assert len(sch) == 3
343	assert repr(sch) == """\
344	foo: int32
345	bar: string
346	baz: list<item: int8>
347	child 0, item: int8"""
348
349	with pytest.raises(TypeError):
350	pa.schema([('foo', None)])
351
352
def test_schema_from_mapping():
    """An ordered ``name -> type`` mapping is accepted by ``pa.schema``."""
    type_by_name = OrderedDict()
    type_by_name['foo'] = pa.int32()
    type_by_name['bar'] = pa.string()
    type_by_name['baz'] = pa.list_(pa.int8())

    expected_types = [pa.int32(), pa.string(), pa.list_(pa.int8())]
    # NOTE(review): the literal below is copied as it appears in the
    # reviewed text; leading whitespace inside it may have been lost in
    # extraction -- confirm against the upstream file.
    expected_repr = """\
foo: int32
bar: string
baz: list<item: int8>
child 0, item: int8"""

    schema = pa.schema(type_by_name)

    assert schema.names == ['foo', 'bar', 'baz']
    assert schema.types == expected_types
    assert len(schema) == 3
    assert repr(schema) == expected_repr

    # A mapping whose value is None must be rejected with TypeError.
    invalid_mapping = OrderedDict()
    invalid_mapping['foo'] = None
    with pytest.raises(TypeError):
        pa.schema(invalid_mapping)
372
373
def test_schema_duplicate_fields():
    """Exercise lookups on a schema in which the name 'foo' occurs twice."""
    first_foo = pa.field('foo', pa.int32())
    bar = pa.field('bar', pa.string())
    second_foo = pa.field('foo', pa.list_(pa.int8()))

    schema = pa.schema([first_foo, bar, second_foo])

    assert schema.names == ['foo', 'bar', 'foo']
    assert schema.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(schema) == 3
    # NOTE(review): the literal below is copied as it appears in the
    # reviewed text; leading whitespace inside it may have been lost in
    # extraction -- confirm against the upstream file.
    assert repr(schema) == """\
foo: int32
bar: string
foo: list<item: int8>
child 0, item: int8"""

    # Positional access is unaffected by the duplicated name.
    assert schema[0].name == 'foo'
    assert schema[0].type == first_foo.type

    # field_by_name is expected to warn on every call below.
    with pytest.warns(FutureWarning):
        found_bar = schema.field_by_name('bar')
    assert found_bar == bar

    with pytest.warns(FutureWarning):
        found_missing = schema.field_by_name('xxx')
    assert found_missing is None

    with pytest.warns((UserWarning, FutureWarning)):
        found_duplicate = schema.field_by_name('foo')
    assert found_duplicate is None

    # Schema::GetFieldIndex
    assert schema.get_field_index('foo') == -1

    # Schema::GetAllFieldIndices
    assert schema.get_all_field_indices('foo') == [0, 2]
404
405
def test_field_flatten():
    """Check ``Field.flatten`` on plain, struct and nested-struct fields."""
    foo_meta = {b'foo': b'bar'}
    plain = pa.field('foo', pa.int32()).with_metadata(foo_meta)
    # Flattening a non-struct field yields the field itself.
    assert plain.flatten() == [plain]

    required = pa.field('bar', pa.float64(), nullable=False)

    # Non-nullable struct parent.
    strict_parent = pa.field('ff', pa.struct([plain, required]),
                             nullable=False)
    # XXX (unexplained marker carried over from the original test)
    expected_strict = [
        pa.field('ff.foo', pa.int32()).with_metadata({b'foo': b'bar'}),
        pa.field('ff.bar', pa.float64(), nullable=False),
    ]
    assert strict_parent.flatten() == expected_strict

    # Nullable parent makes flattened child nullable
    nullable_parent = pa.field('ff', pa.struct([plain, required]))
    expected_nullable = [
        pa.field('ff.foo', pa.int32()).with_metadata({b'foo': b'bar'}),
        pa.field('ff.bar', pa.float64()),
    ]
    assert nullable_parent.flatten() == expected_nullable

    # Only one struct level is flattened per call.
    grandparent = pa.field('fff', pa.struct([nullable_parent]))
    expected_outer = [pa.field('fff.ff', pa.struct([plain, required]))]
    assert grandparent.flatten() == expected_outer
424
425
def test_schema_add_remove_metadata():
    """Round-trip schema metadata via with_metadata / remove_metadata."""
    bare = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ])
    assert bare.metadata is None

    payload = {b'foo': b'bar', b'pandas': b'badger'}
    annotated = bare.with_metadata(payload)
    assert annotated.metadata == payload

    stripped = annotated.remove_metadata()
    assert stripped.metadata is None

    # idempotent
    stripped_again = stripped.remove_metadata()
    assert stripped_again.metadata is None
448
449
def test_schema_equals():
    """Check ``Schema.equals`` with and without metadata comparison.

    Schemas built from the same fields compare equal by default even when
    only one of them carries metadata; with ``check_metadata=True`` they
    must compare unequal, whichever side carries the metadata.
    """
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    metadata = {b'foo': b'bar', b'pandas': b'badger'}

    sch1 = pa.schema(fields)
    sch2 = pa.schema(fields)
    sch3 = pa.schema(fields, metadata=metadata)
    sch4 = pa.schema(fields, metadata=metadata)

    assert sch1.equals(sch2, check_metadata=True)
    assert sch3.equals(sch4, check_metadata=True)
    assert sch1.equals(sch3)
    assert not sch1.equals(sch3, check_metadata=True)
    # The metadata mismatch must be detected from either side.
    assert not sch3.equals(sch1, check_metadata=True)

    # A schema missing the last field is unequal even without metadata
    # checking.  Slice instead of mutating ``fields`` in place.
    shorter = pa.schema(fields[:-1])
    assert not sch1.equals(shorter)
472
473
def test_schema_equals_propagates_check_metadata():
    """Field-level metadata is only compared when check_metadata is set."""
    # ARROW-4088
    plain_bar = pa.field('bar', pa.string())
    annotated_bar = pa.field('bar', pa.string(), metadata={'a': 'alpha'})

    without_meta = pa.schema([pa.field('foo', pa.int32()), plain_bar])
    with_meta = pa.schema([pa.field('foo', pa.int32()), annotated_bar])

    assert not without_meta.equals(with_meta, check_metadata=True)
    assert without_meta.equals(with_meta)
486
487
def test_schema_equals_invalid_type():
    """``Schema.equals`` raises TypeError for non-schema arguments."""
    # ARROW-5873
    schema = pa.schema([pa.field("a", pa.int64())])

    non_schemas = (None, 'string', pa.array([1, 2]))
    for candidate in non_schemas:
        with pytest.raises(TypeError):
            schema.equals(candidate)
495
496
def test_schema_equality_operators():
    """Check ``==`` / ``!=`` between schemas and against other types."""
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    metadata = {b'foo': b'bar', b'pandas': b'badger'}

    bare_a, bare_b = pa.schema(fields), pa.schema(fields)
    meta_a = pa.schema(fields, metadata=metadata)
    meta_b = pa.schema(fields, metadata=metadata)

    assert bare_a == bare_b
    assert meta_a == meta_b

    # __eq__ and __ne__ do not check metadata
    assert bare_a == meta_a
    assert not bare_a != meta_a

    assert bare_b == meta_b

    # comparison with other types doesn't raise
    assert bare_a != []
    assert meta_a != 'foo'
522
523
def test_schema_get_fields():
    """``Schema.field`` accepts a name or an index and rejects bad keys."""
    schema = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ])

    # Lookup by name, by positive index and by negative index.
    by_name = schema.field('foo')
    assert by_name.name == 'foo'
    by_index = schema.field(0)
    assert by_index.name == 'foo'
    by_negative_index = schema.field(-1)
    assert by_negative_index.name == 'baz'

    # Unknown name, unsupported key type, out-of-range index.
    with pytest.raises(KeyError):
        schema.field('other')
    with pytest.raises(TypeError):
        schema.field(0.0)
    with pytest.raises(IndexError):
        schema.field(4)
543
544
def test_schema_negative_indexing():
    """Negative subscripts on a schema count from the end."""
    schema = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ])

    # Each negative index must resolve to the same field as its positive
    # counterpart.
    for negative, positive in ((-1, 2), (-2, 1), (-3, 0)):
        assert schema[negative].equals(schema[positive])

    # One step past either end is an IndexError.
    for out_of_range in (-4, 3):
        with pytest.raises(IndexError):
            schema[out_of_range]
563
564
def test_schema_repr_with_dictionaries():
    """Check the repr of a schema containing a dictionary-typed field."""
    dict_field = pa.field('one', pa.dictionary(pa.int16(), pa.string()))
    int_field = pa.field('two', pa.int32())
    schema = pa.schema([dict_field, int_field])

    expected_repr = """\
one: dictionary<values=string, indices=int16, ordered=0>
two: int32"""

    assert repr(schema) == expected_repr
578
579
def test_type_schema_pickling():
    """Types, fields and a schema built from them survive a pickle trip."""
    union_children = [
        pa.field('a', pa.int8()),
        pa.field('b', pa.int16())
    ]
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.map_(pa.string(), pa.int8()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.union(list(union_children), pa.lib.UnionMode_SPARSE),
        pa.union(list(union_children), pa.lib.UnionMode_DENSE),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal128(12, 2),
        pa.decimal256(76, 38),
        pa.field('a', 'string', metadata={b'foo': b'bar'}),
        pa.list_(pa.field("element", pa.int64())),
        pa.large_list(pa.field("element", pa.int64())),
        pa.map_(pa.field("key", pa.string(), nullable=False),
                pa.field("value", pa.int8()))
    ]

    # Every individual type/field must round-trip unchanged.
    for original in cases:
        restored = pickle.loads(pickle.dumps(original))
        assert original == restored

    # Wrap bare types in generated fields; Field entries are used as-is.
    schema_fields = [
        case if isinstance(case, pa.Field)
        else pa.field('_f{}'.format(position), case)
        for position, case in enumerate(cases)
    ]

    schema = pa.schema(schema_fields, metadata={b'foo': b'bar'})
    restored_schema = pickle.loads(pickle.dumps(schema))
    assert schema == restored_schema
629
630
def test_empty_table():
    """``Schema.empty_table`` returns a zero-row Table with that schema."""
    mixed_types = pa.schema([
        pa.field('f0', pa.int64()),
        pa.field('f1', pa.dictionary(pa.int32(), pa.string())),
        pa.field('f2', pa.list_(pa.list_(pa.int64()))),
    ])
    # test it preserves field nullability
    mixed_nullability = pa.schema([
        pa.field('a', pa.int64(), nullable=False),
        pa.field('b', pa.int64())
    ])

    for schema in (mixed_types, mixed_nullability):
        empty = schema.empty_table()
        assert isinstance(empty, pa.Table)
        assert empty.num_rows == 0
        assert empty.schema == schema
648
649
@pytest.mark.pandas
def test_schema_from_pandas():
    """``Schema.from_pandas`` matches the schema of ``Table.from_pandas``."""
    import pandas as pd

    timestamps = np.array([
        '2007-07-13T01:23:34.123456789',
        '2006-01-13T12:34:56.432539784',
        '2010-08-13T05:46:57.437699912'
    ], dtype='datetime64[ns]')

    columns = [
        list(range(10)),
        pd.Categorical(list(range(10))),
        ['foo', 'bar', None, 'baz', 'qux'],
        timestamps,
    ]
    # The nullable Int32 column is only added for pandas >= 1.0.0.
    if Version(pd.__version__) >= Version('1.0.0'):
        columns.append(pd.array([1, 2, None], dtype=pd.Int32Dtype()))

    for column in columns:
        frame = pd.DataFrame({'a': column})
        inferred = pa.Schema.from_pandas(frame)
        from_table = pa.Table.from_pandas(frame).schema
        assert inferred == from_table
670
671
def test_schema_sizeof():
    """``sys.getsizeof`` of a schema grows as metadata is added."""
    bare = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
    ])
    bare_size = sys.getsizeof(bare)
    assert bare_size > 30

    short_meta = bare.with_metadata({"key": "some metadata"})
    short_meta_size = sys.getsizeof(short_meta)
    assert short_meta_size > sys.getsizeof(bare)

    long_meta = bare.with_metadata({"key": "some more metadata"})
    assert sys.getsizeof(long_meta) > short_meta_size
684
685
def test_schema_merge():
    """``pa.unify_schemas`` merges fields and rejects conflicting types."""
    base = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ])
    overlapping = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('qux', pa.bool_())
    ])
    disjoint = pa.schema([
        pa.field('quux', pa.dictionary(pa.int32(), pa.string()))
    ])
    # Same field names as ``overlapping`` but 'foo' has a different type.
    conflicting = pa.schema([
        pa.field('foo', pa.int64()),
        pa.field('qux', pa.bool_())
    ])

    expected = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
        pa.field('qux', pa.bool_()),
        pa.field('quux', pa.dictionary(pa.int32(), pa.string()))
    ])

    merged_from_list = pa.unify_schemas([base, overlapping, disjoint])
    assert merged_from_list.equals(expected)

    with pytest.raises(pa.ArrowInvalid):
        pa.unify_schemas([overlapping, conflicting])

    # ARROW-14002: Try with tuple instead of list
    merged_from_tuple = pa.unify_schemas((base, overlapping, disjoint))
    assert merged_from_tuple.equals(expected)
720
721
def test_undecodable_metadata():
    """``str(schema)`` still works when metadata bytes are not decodable."""
    # ARROW-10214: undecodable metadata shouldn't fail repr()
    field_payload = b'abcdef\xff\x00'
    schema_payload = b'ghijkl\xff\x00'

    ints = pa.field('ints', pa.int16(), metadata={'key': field_payload})
    schema = pa.schema([ints], metadata={'key': schema_payload})

    # The readable prefix of each payload must appear in the rendering.
    assert 'abcdef' in str(schema)
    assert 'ghijkl' in str(schema)