# Copyright 2022-2023 XProbe Inc.
# derived from copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import pandas as pd
from ... import opcodes
from ...core import recursive_tile
from ...serialization.serializables import AnyField, StringField
from ...utils import calc_nsplits
from ..operands import DataFrameOperand, DataFrameOperandMixin, OutputType
from ..utils import build_empty_df, parse_index, standardize_range_index
class DataFrameMelt(DataFrameOperand, DataFrameOperandMixin):
    """
    Operand that unpivots a DataFrame from wide to long format.

    The melt arguments are kept as serializable fields and passed unchanged
    to ``melt`` in two places: on an empty frame in ``__call__`` (to infer the
    output columns and dtypes) and on each chunk's data in ``execute``.
    """

    _op_type_ = opcodes.MELT

    # Arguments forwarded verbatim to ``melt``.
    _id_vars = AnyField("id_vars")
    _value_vars = AnyField("value_vars")
    _var_name = StringField("var_name")
    _value_name = StringField("value_name")
    _col_level = AnyField("col_level")

    def __init__(
        self,
        id_vars=None,
        value_vars=None,
        var_name=None,
        value_name=None,
        col_level=None,
        **kw
    ):
        """Store the melt arguments; extra keywords go to the base operand."""
        super().__init__(
            _id_vars=id_vars,
            _value_vars=value_vars,
            _var_name=var_name,
            _value_name=value_name,
            _col_level=col_level,
            **kw
        )

    @property
    def id_vars(self):
        """Column(s) used as identifier variables."""
        return self._id_vars

    @property
    def value_vars(self):
        """Column(s) to unpivot."""
        return self._value_vars

    @property
    def var_name(self):
        """Name given to the 'variable' column."""
        return self._var_name

    @property
    def value_name(self):
        """Name given to the 'value' column."""
        return self._value_name

    @property
    def col_level(self):
        """Column level to melt on."""
        return self._col_level

    def __call__(self, df):
        """
        Create the output tileable for melting ``df``.

        Output columns and dtypes are obtained by melting an empty frame built
        from ``df.dtypes``. The row count is left as ``nan`` because it is not
        known at this point.
        """
        mock_input = build_empty_df(df.dtypes)
        mock_output = mock_input.melt(
            id_vars=self.id_vars,
            value_vars=self.value_vars,
            var_name=self.var_name,
            value_name=self.value_name,
            col_level=self.col_level,
        )
        self._output_types = [OutputType.dataframe]
        return self.new_tileable(
            [df],
            shape=(np.nan, len(mock_output.columns)),
            dtypes=mock_output.dtypes,
            index_value=parse_index(pd.RangeIndex(-1), df.key, df.index_value.key),
            columns_value=parse_index(mock_output.columns, store_data=True),
        )

    @classmethod
    def tile(cls, op: "DataFrameMelt"):
        """
        Tile the operand into one melt chunk per input chunk.

        The input is first rechunked so that axis 1 is a single chunk spanning
        all columns. Each resulting chunk is then mapped to its own melt chunk.
        """
        source = op.inputs[0]
        result = op.outputs[0]

        # A single chunk along axis 1 whose width is the full column count.
        full_width = {1: (source.shape[1],)}
        source = yield from recursive_tile(source.rechunk(full_width))

        melted_chunks = [
            op.copy()
            .reset_key()
            .new_chunk(
                [chunk],
                index=chunk.index,
                shape=(np.nan, result.shape[1]),
                dtypes=result.dtypes,
                index_value=parse_index(
                    pd.RangeIndex(-1), chunk.key, chunk.index_value.key
                ),
                columns_value=result.columns_value,
            )
            for chunk in source.chunks
        ]

        # NOTE(review): presumably yielding the chunks lets the tiling
        # framework process them so their row counts are known before the
        # nsplits computation below -- confirm against the tile protocol.
        yield melted_chunks
        # NOTE(review): presumably rewrites the per-chunk index metadata into
        # one consistent range index -- confirm in ``standardize_range_index``.
        melted_chunks = standardize_range_index(melted_chunks)

        tiled_op = op.copy().reset_key()
        return tiled_op.new_tileables(
            [source],
            chunks=melted_chunks,
            nsplits=calc_nsplits({chunk.index: chunk.shape for chunk in melted_chunks}),
            **result.params
        )

    @classmethod
    def execute(cls, ctx, op: "DataFrameMelt"):
        """Melt the input chunk's data from ``ctx`` and store it under the output key."""
        chunk_data = ctx[op.inputs[0].key]
        melted = chunk_data.melt(
            id_vars=op.id_vars,
            value_vars=op.value_vars,
            var_name=op.var_name,
            value_name=op.value_name,
            col_level=op.col_level,
        )
        ctx[op.outputs[0].key] = melted
def melt(
    frame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
):
    """
    Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.

    This function is useful to massage a DataFrame into a format where one
    or more columns are identifier variables (`id_vars`), while all other
    columns, considered measured variables (`value_vars`), are "unpivoted" to
    the row axis, leaving just two non-identifier columns, 'variable' and
    'value'.

    .. versionadded:: 0.20.0

    Parameters
    ----------
    frame : DataFrame
        The DataFrame to unpivot.
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot. If not specified, uses all columns that
        are not set as `id_vars`.
    var_name : scalar
        Name to use for the 'variable' column. If None it uses
        ``frame.columns.name`` or 'variable'.
    value_name : scalar, default 'value'
        Name to use for the 'value' column.
    col_level : int or str, optional
        If columns are a MultiIndex then use this level to melt.

    Returns
    -------
    DataFrame
        Unpivoted DataFrame.

    See Also
    --------
    melt
    pivot_table
    DataFrame.pivot
    Series.explode

    Examples
    --------
    >>> import mars.dataframe as md
    >>> df = md.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
    ...                    'B': {0: 1, 1: 3, 2: 5},
    ...                    'C': {0: 2, 1: 4, 2: 6}})
    >>> df.execute()
       A  B  C
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> df.melt(id_vars=['A'], value_vars=['B']).execute()
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> df.melt(id_vars=['A'], value_vars=['B', 'C']).execute()
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5
    3  a        C      2
    4  b        C      4
    5  c        C      6

    The names of 'variable' and 'value' columns can be customized:

    >>> df.melt(id_vars=['A'], value_vars=['B'],
    ...         var_name='myVarname', value_name='myValname').execute()
       A myVarname  myValname
    0  a         B          1
    1  b         B          3
    2  c         B          5

    If you have multi-index columns:

    >>> df = md.DataFrame({('A', 'D'): {0: 'a', 1: 'b', 2: 'c'},
    ...                    ('B', 'E'): {0: 1, 1: 3, 2: 5},
    ...                    ('C', 'F'): {0: 2, 1: 4, 2: 6}})
    >>> df.execute()
       A  B  C
       D  E  F
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> df.melt(col_level=0, id_vars=['A'], value_vars=['B']).execute()
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> df.melt(id_vars=[('A', 'D')], value_vars=[('B', 'E')]).execute()
      (A, D) variable_0 variable_1  value
    0      a          B          E      1
    1      b          B          E      3
    2      c          B          E      5
    """
    # Capture the arguments in an operand; calling it on ``frame`` builds the
    # (lazy) melted DataFrame rather than computing it immediately.
    op = DataFrameMelt(
        id_vars=id_vars,
        value_vars=value_vars,
        var_name=var_name,
        value_name=value_name,
        col_level=col_level,
    )
    return op(frame)