# Source code for xorbits._mars.dataframe.base.melt

# Copyright 2022-2023 XProbe Inc.
# derived from copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd

from ... import opcodes
from ...core import recursive_tile
from ...serialization.serializables import AnyField, StringField
from ...utils import calc_nsplits
from ..operands import DataFrameOperand, DataFrameOperandMixin, OutputType
from ..utils import build_empty_df, parse_index, standardize_range_index


class DataFrameMelt(DataFrameOperand, DataFrameOperandMixin):
    """Operand that unpivots a DataFrame from wide to long format.

    The operand stores the ``melt`` arguments as serializable fields and, at
    execution time, calls ``melt`` on the data of each input chunk
    independently.
    """

    _op_type_ = opcodes.MELT

    # Arguments forwarded unchanged to ``melt`` (see ``_melt_kwargs``).
    _id_vars = AnyField("id_vars")
    _value_vars = AnyField("value_vars")
    _var_name = StringField("var_name")
    _value_name = StringField("value_name")
    _col_level = AnyField("col_level")

    def __init__(
        self,
        id_vars=None,
        value_vars=None,
        var_name=None,
        value_name=None,
        col_level=None,
        **kw
    ):
        """Store the ``melt`` arguments on the operand's private fields.

        Any additional keyword arguments are passed through to the parent
        operand constructor.
        """
        field_values = dict(
            _id_vars=id_vars,
            _value_vars=value_vars,
            _var_name=var_name,
            _value_name=value_name,
            _col_level=col_level,
        )
        super().__init__(**field_values, **kw)

    @property
    def id_vars(self):
        """Column(s) used as identifier variables."""
        return self._id_vars

    @property
    def value_vars(self):
        """Column(s) to unpivot."""
        return self._value_vars

    @property
    def var_name(self):
        """Name of the resulting 'variable' column."""
        return self._var_name

    @property
    def value_name(self):
        """Name of the resulting 'value' column."""
        return self._value_name

    @property
    def col_level(self):
        """Column level to melt on when the columns are a MultiIndex."""
        return self._col_level

    def _melt_kwargs(self):
        """Return the keyword arguments this operand passes to ``melt``."""
        return dict(
            id_vars=self.id_vars,
            value_vars=self.value_vars,
            var_name=self.var_name,
            value_name=self.value_name,
            col_level=self.col_level,
        )

    def __call__(self, df):
        """Build the output tileable describing the melted form of ``df``.

        The output columns and dtypes are derived by melting an empty frame
        built from ``df.dtypes``; the number of rows is left as NaN.
        """
        schema_probe = build_empty_df(df.dtypes).melt(**self._melt_kwargs())
        self._output_types = [OutputType.dataframe]
        out_columns = schema_probe.columns
        return self.new_tileable(
            [df],
            shape=(np.nan, len(out_columns)),
            dtypes=schema_probe.dtypes,
            index_value=parse_index(pd.RangeIndex(-1), df.key, df.index_value.key),
            columns_value=parse_index(out_columns, store_data=True),
        )

    @classmethod
    def tile(cls, op: "DataFrameMelt"):
        """Tile the operand into one melt chunk per input chunk.

        The input is first rechunked so that the column axis forms a single
        chunk; every resulting chunk then gets its own copy of the operand.
        """
        out = op.outputs[0]
        src = op.inputs[0]

        # Merge all columns into one chunk along axis 1.
        src = yield from recursive_tile(src.rechunk({1: (src.shape[1],)}))

        n_out_cols = out.shape[1]
        melted_chunks = []
        for in_chunk in src.chunks:
            chunk_op = op.copy().reset_key()
            placeholder_index = parse_index(
                pd.RangeIndex(-1), in_chunk.key, in_chunk.index_value.key
            )
            melted_chunks.append(
                chunk_op.new_chunk(
                    [in_chunk],
                    index=in_chunk.index,
                    # Row count is not known when the chunk is created.
                    shape=(np.nan, n_out_cols),
                    dtypes=out.dtypes,
                    index_value=placeholder_index,
                    columns_value=out.columns_value,
                )
            )

        # NOTE(review): yielding the chunks presumably lets the tiling
        # framework resolve their shapes before nsplits are computed below
        # -- confirm against the tiling framework.
        yield melted_chunks
        melted_chunks = standardize_range_index(melted_chunks)
        tileable_op = op.copy().reset_key()
        shapes_by_index = {chunk.index: chunk.shape for chunk in melted_chunks}
        return tileable_op.new_tileables(
            [src],
            chunks=melted_chunks,
            nsplits=calc_nsplits(shapes_by_index),
            **out.params
        )

    @classmethod
    def execute(cls, ctx, op: "DataFrameMelt"):
        """Melt the input chunk data and store the result in ``ctx``."""
        source_frame = ctx[op.inputs[0].key]
        melted = source_frame.melt(**op._melt_kwargs())
        ctx[op.outputs[0].key] = melted


def melt(
    frame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
):
    """
    Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.

    This function is useful to massage a DataFrame into a format where one
    or more columns are identifier variables (`id_vars`), while all other
    columns, considered measured variables (`value_vars`), are "unpivoted" to
    the row axis, leaving just two non-identifier columns, 'variable' and
    'value'.

    .. versionadded:: 0.20.0

    Parameters
    ----------
    frame : DataFrame
        The DataFrame to unpivot.
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot. If not specified, uses all columns that
        are not set as `id_vars`.
    var_name : scalar
        Name to use for the 'variable' column. If None it uses
        ``frame.columns.name`` or 'variable'.
    value_name : scalar, default 'value'
        Name to use for the 'value' column.
    col_level : int or str, optional
        If columns are a MultiIndex then use this level to melt.

    Returns
    -------
    DataFrame
        Unpivoted DataFrame.

    See Also
    --------
    melt
    pivot_table
    DataFrame.pivot
    Series.explode

    Examples
    --------
    >>> import mars.dataframe as md
    >>> df = md.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
    ...                    'B': {0: 1, 1: 3, 2: 5},
    ...                    'C': {0: 2, 1: 4, 2: 6}})
    >>> df.execute()
       A  B  C
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> df.melt(id_vars=['A'], value_vars=['B']).execute()
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> df.melt(id_vars=['A'], value_vars=['B', 'C']).execute()
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5
    3  a        C      2
    4  b        C      4
    5  c        C      6

    The names of 'variable' and 'value' columns can be customized:

    >>> df.melt(id_vars=['A'], value_vars=['B'],
    ...         var_name='myVarname', value_name='myValname').execute()
       A myVarname  myValname
    0  a         B          1
    1  b         B          3
    2  c         B          5

    If you have multi-index columns:

    >>> df = md.DataFrame({('A', 'D'): {0: 'a', 1: 'b', 2: 'c'},
    ...                    ('B', 'E'): {0: 1, 1: 3, 2: 5},
    ...                    ('C', 'F'): {0: 2, 1: 4, 2: 6}})
    >>> df.execute()
       A  B  C
       D  E  F
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> df.melt(col_level=0, id_vars=['A'], value_vars=['B']).execute()
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> df.melt(id_vars=[('A', 'D')], value_vars=[('B', 'E')]).execute()
      (A, D) variable_0 variable_1  value
    0      a          B          E      1
    1      b          B          E      3
    2      c          B          E      5
    """
    # Capture the arguments in an operand, then apply it to the frame to
    # obtain the melted result.
    op = DataFrameMelt(
        id_vars=id_vars,
        value_vars=value_vars,
        var_name=var_name,
        value_name=value_name,
        col_level=col_level,
    )
    return op(frame)