Source code for text_extensions_for_pandas.spanner.project

#
#  Copyright (c) 2020 IBM Corp.
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

#
# project.py
#
# Projection functions (functions that take values from one tuple and return a
# scalar) for spans.
#

import pandas as pd

from typing import *

# Internal imports
from text_extensions_for_pandas.array.span import (
    SpanArray, Span
)
from text_extensions_for_pandas.array.token_span import (
    TokenSpanArray
)


def lemmatize(
    spans: Union[pd.Series, SpanArray, Iterable[Span]],
    token_features: pd.DataFrame,
    lemma_col_name: str = "lemma",
    token_span_col_name: str = "span",
) -> List[str]:
    """
    Convert spans to their normal form using lemma information in a token
    features table.

    :param spans: Spans to be normalized. Each may represent zero or more
        tokens.
    :param token_features: DataFrame of token metadata. Index must be aligned
        with the token indices in `spans`.
    :param lemma_col_name: Optional custom name for the DataFrame column
        containing the lemmatized form of each token.
    :param token_span_col_name: Optional custom name for the DataFrame column
        containing the span of each token.

    :return: A list containing normalized versions of the tokens in `spans`,
        with each token separated by a single space character.
    """
    # Convert the input to a SpanArray of character-based spans.
    char_spans = SpanArray.make_array(spans)
    # Expand each character-based span to the range of tokens it covers.
    token_spans = TokenSpanArray.align_to_tokens(
        token_features[token_span_col_name], char_spans
    )
    ret = []  # type: List[str]
    # TODO: Vectorize this loop
    for i in range(len(token_spans)):
        # Look up the lemma of every token inside the i-th span.
        lemmas = token_features[lemma_col_name][
            token_spans.begin_token[i]:token_spans.end_token[i]
        ]
        ret.append(" ".join(lemmas))
    return ret
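

# The sketch below illustrates how lemmatize() might be called. It is not part
# of the module; the example data and the SpanArray(text, begins, ends)
# constructor call reflect the author's understanding of this library's API
# and should be treated as assumptions, not canonical usage.
#
#     import pandas as pd
#
#     from text_extensions_for_pandas.array.span import SpanArray
#     from text_extensions_for_pandas.spanner.project import lemmatize
#
#     text = "The dogs were running"
#
#     # One row per token: the token's span plus its lemma, as a tokenizer
#     # with lemmatization (e.g. spaCy) might produce.
#     token_features = pd.DataFrame({
#         "span": SpanArray(text, [0, 4, 9, 14], [3, 8, 13, 21]),
#         "lemma": ["the", "dog", "be", "run"],
#     })
#
#     # A single character-based span covering the last two tokens.
#     spans = SpanArray(text, [9], [21])
#
#     print(lemmatize(spans, token_features))  # ['be run']
#
# Because align_to_tokens() snaps each input span to whole tokens, the span
# [9, 21) resolves to tokens 2-3, whose lemmas join to "be run".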