# Source code for xorbits._mars.dataframe.base.to_numeric

# Copyright 2022-2023 XProbe Inc.
# derived from copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import pandas as pd

from ...core import ENTITY_TYPE, OutputType
from ...serialization.serializables import StringField
from ...tensor import tensor as astensor
from ...tensor.core import TENSOR_TYPE, TensorOrder
from ..core import SERIES_TYPE
from ..initializer import Series as asseries
from ..operands import DataFrameOperand, DataFrameOperandMixin


class DataFrameToNumeric(DataFrameOperand, DataFrameOperandMixin):
    """Operand that applies ``pandas.to_numeric`` to a 1-d series or tensor.

    The operand is tiled chunk by chunk: every input chunk yields exactly one
    output chunk, and ``pd.to_numeric`` is run on each chunk independently at
    execution time.
    """

    # Error-handling mode passed through to ``pd.to_numeric``.
    errors = StringField("errors")
    # Downcast mode passed through to ``pd.to_numeric``.
    downcast = StringField("downcast")

    def __init__(self, errors="raise", downcast=None, **kw):
        """Store ``errors`` and ``downcast`` as operand fields."""
        super().__init__(errors=errors, downcast=downcast, **kw)

    def __call__(self, arg):
        """Build the output tileable for ``arg``.

        A pandas Series is wrapped via ``asseries``; anything that is not
        already an entity is wrapped via ``astensor``.

        Returns a series tileable when ``arg`` is a series, otherwise a
        tensor tileable.

        Raises
        ------
        ValueError
            If the wrapped input is not 1-dimensional, or if it is empty.
        """
        if isinstance(arg, pd.Series):
            arg = asseries(arg)
        elif not isinstance(arg, ENTITY_TYPE):
            arg = astensor(arg)

        if arg.ndim != 1:
            raise ValueError("Input array must be 1 dimensional")
        if arg.size == 0:
            raise ValueError("Input array can not be empty")

        if not isinstance(arg, asseries):
            # Tensor path.
            self.output_types = [OutputType.tensor]
            out_dtype = arg.dtype
            if out_dtype.kind == "U":
                # Unicode string tensors are declared with an object dtype.
                out_dtype = np.dtype(object)
            tileables = self.new_tileables([arg], shape=arg.shape, dtype=out_dtype)
            return tileables[0]

        # Series path.
        # NOTE: the declared dtype is copied from the input; the dtype of the
        # data actually produced is decided by ``pd.to_numeric`` in ``execute``.
        self.output_types = [OutputType.series]
        series_params = {
            "shape": arg.shape,
            "name": arg.name,
            "index_value": arg.index_value,
            "dtype": arg.dtype,
        }
        return self.new_series([arg], **series_params)

    @classmethod
    def tile(cls, op):
        """Create one output chunk per input chunk, keeping the input's nsplits."""
        source = op.inputs[0]
        result = op.outputs[0]

        chunks = []
        for piece in source.chunks:
            # Each chunk gets its own copy of the operand with a fresh key.
            chunk_op = op.copy().reset_key()
            if isinstance(result, SERIES_TYPE):
                kws = [
                    dict(
                        dtype=result.dtype,
                        shape=piece.shape,
                        index=piece.index,
                        index_value=piece.index_value,
                        name=piece.name,
                    )
                ]
            elif isinstance(result, TENSOR_TYPE):
                kws = [
                    dict(
                        dtype=result.dtype,
                        shape=piece.shape,
                        order=TensorOrder.C_ORDER,
                        index=piece.index,
                    )
                ]
            else:
                kws = []
            chunks.append(chunk_op.new_chunk([piece], kws=kws))

        tiled_op = op.copy()
        params = result.params
        params["nsplits"] = source.nsplits
        params["chunks"] = chunks
        return tiled_op.new_tileables(op.inputs, kws=[params])

    @classmethod
    def execute(cls, ctx, op):
        """Run ``pd.to_numeric`` on the input chunk and store the result in ``ctx``."""
        raw = ctx[op.inputs[0].key]
        mode = op.errors
        shrink = op.downcast
        converted = pd.to_numeric(raw, errors=mode, downcast=shrink)
        ctx[op.outputs[0].key] = converted


def to_numeric(arg, errors="raise", downcast=None):
    """
    Convert argument to a numeric type.

    The default return dtype is `float64` or `int64`
    depending on the data supplied. Use the `downcast` parameter
    to obtain other dtypes.

    Please note that precision loss may occur if really large numbers
    are passed in. Due to the internal limitations of `ndarray`, if
    numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
    or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
    passed in, it is very likely they will be converted to float so that
    they can be stored in an `ndarray`. These warnings apply similarly to
    `Series` since it internally leverages `ndarray`.

    Parameters
    ----------
    arg : scalar, list, tuple, 1-d array, or Series
        Argument to be converted.
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaN.
        - If 'ignore', then invalid parsing will return the input.
    downcast : {'integer', 'signed', 'unsigned', 'float'}, default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.

    Returns
    -------
    ret
        Numeric if parsing succeeded.
        Return type depends on input.  Series if Series, otherwise Tensor.

    Raises
    ------
    ValueError
        If `errors` or `downcast` is not one of the accepted values, or
        if the input is not 1-dimensional or is empty.

    See Also
    --------
    DataFrame.astype : Cast argument to a specified dtype.
    to_datetime : Convert argument to datetime.
    to_timedelta : Convert argument to timedelta.
    numpy.ndarray.astype : Cast a numpy array to a specified type.
    DataFrame.convert_dtypes : Convert dtypes.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> s = md.Series(['1.0', '2', -3])
    >>> md.to_numeric(s).execute()
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> md.to_numeric(s, downcast='float').execute()
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> md.to_numeric(s, downcast='signed').execute()
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = md.Series(['apple', '1.0', '2', -3])
    >>> md.to_numeric(s, errors='ignore').execute()
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> md.to_numeric(s, errors='coerce').execute()
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64

    Downcasting of integer and floating dtypes is supported:

    >>> s = md.Series([1, 2, 3], dtype="int64")
    >>> md.to_numeric(s, downcast="integer").execute()
    0    1
    1    2
    2    3
    dtype: int8
    >>> s = md.Series([1.0, 2.1, 3.0], dtype="float64")
    >>> md.to_numeric(s, downcast="float").execute()
    0    1.0
    1    2.1
    2    3.0
    dtype: float32
    """
    # Validate eagerly so a bad option fails when the call is made rather
    # than later, when the operand is executed.
    if errors not in ("ignore", "raise", "coerce"):
        raise ValueError("invalid error value specified")
    if downcast not in (None, "integer", "signed", "unsigned", "float"):
        raise ValueError("invalid downcasting method provided")

    op = DataFrameToNumeric(errors=errors, downcast=downcast)
    return op(arg)