# Source code for text_extensions_for_pandas.spanner.consolidate

#
#  Copyright (c) 2020 IBM Corp.
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

#
# consolidate.py
#
# Variants of the Consolidate operator from spanner algebra.
# The Consolidate operator removes spans that conflict with other spans
# according to a consolidation policy.
#

import pandas as pd

from text_extensions_for_pandas.array.span import SpanArray


def consolidate(df: pd.DataFrame, on: str,
                how: str = "left_to_right") -> pd.DataFrame:
    """
    Eliminate overlap among the spans in one column of a :class:`pd.DataFrame`.

    :param df: DataFrame containing spans and other attributes
    :param on: Name of column in `df` on which to perform consolidation.
     The column must be backed by a :class:`SpanArray`.
    :param how: What policy to use to decide what spans are considered to
     overlap and which of an overlapping pair will remain after
     consolidation. Available policies:

     * ``left_to_right``: Walk through the spans from left to right, keeping
       the longest non-overlapping match at each position encountered.
       A span that begins exactly where the previously retained span ends
       is not considered to overlap it. At most one span is retained per
       begin offset, so exact duplicates (including duplicate zero-length
       spans) collapse to a single row.

    :returns: the rows of `df` that remain after applying the specified policy
     to the spans in the column specified by `on`. Rows are returned in
     ascending order of span begin offset, not in their original order.

    :raises TypeError: if column `on` of `df` does not hold a
     :class:`SpanArray`.
    :raises ValueError: if `how` is anything other than ``"left_to_right"``.
    """
    spans = df[on].values
    if not isinstance(spans, SpanArray):
        raise TypeError(f"Column '{on}' of dataframe is of type "
                        f"{df[on].dtype}, which is not a span type.")
    if how != "left_to_right":
        raise ValueError(f"Receieved '{how}' for `how` argument, but "
                         f"the only valid value for that argument is "
                         f"'left_to_right'.")

    # Sort by begin ascending, then by end DESCENDING, so that the first span
    # seen at any begin offset is the longest span starting there. "ix"
    # remembers each span's original position in `df`.
    ordered = pd.DataFrame({
        "begin": spans.begin,
        "end": spans.end,
        "ix": range(len(spans)),
    }).sort_values(["begin", "end"], ascending=[True, False])

    # Read the sorted columns once as arrays; indexing the DataFrame row by
    # row inside the loop would build a new Series on every access.
    begins = ordered["begin"].values
    ends = ordered["end"].values
    positions = ordered["ix"].values
    num_spans = len(positions)

    ix_to_retain = []  # Original row positions of the spans that survive
    cursor = 0
    while cursor < num_spans:
        # Loop invariants:
        # * cursor == location of a span that doesn't overlap with any
        #   span in ix_to_retain
        # * All locations before cursor have been processed.
        cur_begin = begins[cursor]
        cur_end = ends[cursor]
        ix_to_retain.append(positions[cursor])

        # Always step past the span we just retained. Without this, a
        # zero-length span (begin == end) never satisfies the skip condition
        # below and the loop would spin forever on the same location.
        cursor += 1

        # Skip every remaining span that conflicts with the retained one:
        # spans that begin before it ends, plus spans that share its begin
        # offset (only reachable here for duplicate zero-length spans).
        while (cursor < num_spans
               and (begins[cursor] < cur_end
                    or begins[cursor] == cur_begin)):
            cursor += 1

    return df.iloc[ix_to_retain]