#
# Copyright (c) 2020 IBM Corp.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
# spacy.py
#
"""
The ``io.spacy`` module contains I/O functions related to the SpaCy_ NLP library.
.. _SpaCy: https://spacy.io
"""
import re
import string
import numpy as np
import pandas as pd
from text_extensions_for_pandas.array.span import (
SpanArray,
SpanDtype,
)
from text_extensions_for_pandas.array.token_span import (
TokenSpanArray,
TokenSpanDtype,
)
# To avoid creating an unnecessary dependency on SpaCy for non-SpaCy
# applications, we do NOT `import spacy` at the top level of this file,
# and we do NOT include type hints for SpaCy types in the function
# signatures below.
# Lazily-created tokenizer shared by every caller of simple_tokenizer().
_SIMPLE_TOKENIZER = None


def simple_tokenizer() -> "spacy.tokenizer.Tokenizer":
    """
    Return the module-wide "simple" tokenizer, building it on the first call.

    The tokenizer is created with an empty vocabulary and with prefix, suffix
    and infix rules that all match a single character from
    ``string.punctuation``.

    :returns: Singleton instance of a SpaCy tokenizer that splits text on all
     whitespace and all punctuation characters.
     This type of tokenization is recommended for dictionary and regular
     expression matching.
    """
    global _SIMPLE_TOKENIZER
    if _SIMPLE_TOKENIZER is not None:
        # Already built by an earlier call; reuse it.
        return _SIMPLE_TOKENIZER

    # SpaCy is imported here, not at module level, so that importing this
    # module does not require SpaCy to be installed.
    # noinspection PyPackageRequirements
    import spacy

    punct_chars = re.escape(string.punctuation)
    leading_punct = re.compile(f"^[{punct_chars}]")
    trailing_punct = re.compile(f"[{punct_chars}]$")
    any_punct = re.compile(f"[{punct_chars}]")

    _SIMPLE_TOKENIZER = spacy.tokenizer.Tokenizer(
        spacy.vocab.Vocab(),
        prefix_search=leading_punct.search,
        suffix_search=trailing_punct.search,
        infix_finditer=any_punct.finditer,
    )
    return _SIMPLE_TOKENIZER
def make_tokens(target_text: str, tokenizer: "spacy.tokenizer.Tokenizer" = None) \
        -> pd.Series:
    """
    Tokenize a text and return the tokens as a Series of character spans.

    :param target_text: Text to tokenize
    :param tokenizer: Preconfigured `spacy.tokenizer.Tokenizer` object, or None
     to use the tokenizer returned by :func:`simple_tokenizer()`

    :return: The tokens (and underlying text) as a Pandas Series wrapped around
     a :class:`SpanArray` value.
    """
    if tokenizer is None:
        tokenizer = simple_tokenizer()
    spacy_doc = tokenizer(target_text)

    # Give the offset arrays an explicit integer dtype. Without it NumPy infers
    # float64 for an empty list, so a text with zero tokens would produce
    # floating-point begin/end offsets.
    tok_begins = np.array([t.idx for t in spacy_doc], dtype=int)
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc], dtype=int)
    return pd.Series(SpanArray(target_text, tok_begins, tok_ends))
def make_tokens_and_features(
    target_text: str, language_model, add_left_and_right: bool = False,
) -> pd.DataFrame:
    """
    Run a spaCy language model over a text and tabulate the per-token output.

    :param target_text: Text to analyze
    :param language_model: Preconfigured spaCy language model
     (`spacy.language.Language`) object
    :param add_left_and_right: If ``True``, add columns "left" and "right"
     containing references to previous and next tokens.

    :return: A single `pd.DataFrame` with one row per token, containing the
     token's span plus the linguistic features that the language model
     generates. Columns: "id", "span", "lemma", "pos", "tag", "dep", "head",
     "shape", "ent_iob", "ent_type", "is_alpha", "is_stop" and "sentence",
     plus "left" and "right" when `add_left_and_right` is ``True``.
    """
    spacy_doc = language_model(target_text)
    num_toks = len(spacy_doc)

    # TODO: Performance tuning of the translation code that follows

    # Represent the character spans of the tokens. The explicit dtype keeps
    # the offsets integral even when there are no tokens (NumPy would
    # otherwise infer float64 for an empty list).
    tok_begins = np.array([t.idx for t in spacy_doc], dtype=int)
    tok_ends = np.array([t.idx + len(t) for t in spacy_doc], dtype=int)
    tokens_array = SpanArray(target_text, tok_begins, tok_ends)
    tokens_series = pd.Series(tokens_array)

    # Map each token's `idx` (the character offset at which the token begins,
    # as used for `tok_begins` above) to the token's position in the document.
    # The positions form the dense range of integer IDs that corresponds to
    # the index of the returned DataFrame.
    idx_to_id = {tok.idx: tok_id for tok_id, tok in enumerate(spacy_doc)}

    # Define the IOB categorical type with "O" == 0, "B"==1, "I"==2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)

    df_cols = {
        "id": range(num_toks),
        "span": tokens_series,
        "lemma": [t.lemma_ for t in spacy_doc],
        "pos": pd.Categorical([str(t.pos_) for t in spacy_doc]),
        "tag": pd.Categorical([str(t.tag_) for t in spacy_doc]),
        "dep": pd.Categorical([str(t.dep_) for t in spacy_doc]),
        "head": np.array([idx_to_id[t.head.idx] for t in spacy_doc], dtype=int),
        "shape": pd.Categorical([t.shape_ for t in spacy_doc]),
        "ent_iob": pd.Categorical([str(t.ent_iob_) for t in spacy_doc],
                                  dtype=iob2_dtype),
        "ent_type": pd.Categorical([str(t.ent_type_) for t in spacy_doc]),
        "is_alpha": np.array([t.is_alpha for t in spacy_doc], dtype=bool),
        "is_stop": np.array([t.is_stop for t in spacy_doc], dtype=bool),
        "sentence": _make_sentences_series(spacy_doc, tokens_array),
    }

    if add_left_and_right:
        # The first token has no left neighbor and the last token has no right
        # neighbor. Guard the zero-token case so that the columns stay the
        # same length as the other columns.
        if num_toks > 0:
            left_ids = [None] + list(range(num_toks - 1))
            right_ids = list(range(1, num_toks)) + [None]
        else:
            left_ids = []
            right_ids = []
        # Use nullable int type because these columns contain nulls
        df_cols["left"] = pd.array(left_ids, dtype=pd.Int32Dtype())
        df_cols["right"] = pd.array(right_ids, dtype=pd.Int32Dtype())

    return pd.DataFrame(df_cols)
def _make_sentences_series(spacy_doc, tokens: SpanArray):
    """
    Subroutine of :func:`make_tokens_and_features`

    Builds, for every token position, the token-offset interval of the
    sentence that covers it.

    :param spacy_doc: parsed document (:class:`spacy.tokens.doc.Doc`) from a spaCy
     language model
    :param tokens: Token information for the current document as a
     :class:`SpanArray` object. Must contain the same tokens as `spacy_doc`.

    :returns: a Pandas Series, backed by a :class:`TokenSpanArray`, containing
     the token span of the (single) sentence that each token is in. Positions
     that no sentence in ``spacy_doc.sents`` covers keep begin and end
     offsets of -1.
    """
    token_count = len(spacy_doc)

    # Begin and end token offsets of the enclosing sentence, one slot per
    # token; -1 marks a slot that has not been assigned to any sentence.
    sentence_begins = np.full(shape=token_count, fill_value=-1, dtype=np.int32)
    sentence_ends = np.full(shape=token_count, fill_value=-1, dtype=np.int32)

    for sentence in spacy_doc.sents:
        first_tok, end_tok = sentence.start, sentence.end
        # Stamp the sentence's [begin, end) interval onto each of its tokens.
        covered = slice(first_tok, end_tok)
        sentence_begins[covered] = first_tok
        sentence_ends[covered] = end_tok

    sentence_spans = TokenSpanArray(tokens, sentence_begins, sentence_ends)
    return pd.Series(sentence_spans)
def token_features_to_tree(
    token_features: pd.DataFrame,
    text_col: str = "span",
    tag_col: str = "tag",
    label_col: str = "dep",
    head_col: str = "head",
):
    """
    Convert a DataFrame in the format returned by `make_tokens_and_features()`
    to the public input format of displaCy's dependency tree renderer.

    :param token_features: A subset of a token features DataFrame in the format
     returned by `make_tokens_and_features()`. Must at a minimum contain the
     `head` column and an integer index that corresponds to the ints
     in the `head` column.
    :param text_col: Name of the column in `token_features` from which the
     'covered text' label for each node of the parse tree should be extracted,
     or `None` to leave those labels blank.
    :param tag_col: Name of the column in `token_features` from which the
     'tag' label for each node of the parse tree should be extracted; or `None`
     to leave those labels blank.
    :param label_col: Name of the column in `token_features` from which the
     label for each edge of the parse tree should be extracted; or `None`
     to leave those labels blank.
    :param head_col: Name of the column in `token_features` from which the
     head node of each parse tree node should be extracted.

    :returns: Native Python type representation of the parse tree in a format
     suitable to pass to ``displacy.render(manual=True ...)``: a dict with a
     "words" list (one ``{"text", "tag"}`` record per input row) and an
     "arcs" list (one ``{"start", "end", "label", "dir"}`` record per edge
     that is not a self-link).
     See https://spacy.io/usage/visualizers for the specification of this format.
    """
    num_rows = len(token_features.index)

    # displaCy expects most inputs as strings. Centralize this conversion.
    def _get_text(col_name):
        if col_name is None:
            # One empty string per row
            return np.zeros(shape=num_rows, dtype=str)
        series = token_features[col_name]
        if isinstance(series.dtype, (SpanDtype, TokenSpanDtype)):
            return series.values.covered_text
        return series.astype(str)

    # Renumber the head column to a dense range starting from zero
    tok_map = {tok_id: pos for pos, tok_id in enumerate(token_features.index)}

    # Note that we turn any links to tokens not in our input rows into
    # self-links, which will get removed later on.
    remapped_head_tok = np.array(
        [
            tok_map.get(head, pos)
            for pos, head in enumerate(token_features[head_col].values)
        ],
        dtype=int,
    )

    words_df = pd.DataFrame({"text": _get_text(text_col), "tag": _get_text(tag_col)})
    edges_df = pd.DataFrame(
        {
            "from": range(num_rows),
            "to": remapped_head_tok,
            "label": _get_text(label_col),
        }
    )

    # displaCy requires all arcs to have their start and end be in
    # numeric order. An additional attribute "dir" tells which way
    # (left or right) each arc goes.
    #
    # Compute "dir" up front instead of patching the column afterwards with
    # `arcs_df["dir"].mask(..., inplace=True)`: that form updates a temporary
    # Series, and under pandas Copy-on-Write the change never reaches the
    # DataFrame, leaving every arc marked "left".
    points_right = edges_df["from"] > edges_df["to"]
    arcs_df = pd.DataFrame(
        {
            "start": edges_df[["from", "to"]].min(axis=1),
            "end": edges_df[["from", "to"]].max(axis=1),
            "label": edges_df["label"],
            "dir": np.where(points_right, "right", "left"),
        }
    )

    # Don't render self-links
    arcs_df = arcs_df[arcs_df["start"] != arcs_df["end"]]

    return {
        "words": words_df.to_dict(orient="records"),
        "arcs": arcs_df.to_dict(orient="records"),
    }
def render_parse_tree(
    token_features: pd.DataFrame,
    text_col: str = "span",
    tag_col: str = "tag",
    label_col: str = "dep",
    head_col: str = "head",
) -> None:
    """
    Display a DataFrame in the format returned by :func:`make_tokens_and_features`
    using displaCy's dependency tree renderer.

    The DataFrame is first converted with :func:`token_features_to_tree`, then
    handed to ``spacy.displacy.render(..., manual=True)``. Whatever that call
    returns is passed back to the caller unchanged.

    See https://spacy.io/usage/visualizers for more information on displaCy.

    :param token_features: A subset of a token features DataFrame in the format
     returned by `make_tokens_and_features()`. Must at a minimum contain the
     `head` column and an integer index that corresponds to the ints
     in the `head` column.
    :param text_col: Name of the column in `token_features` from which the
     'covered text' label for each node of the parse tree should be extracted,
     or `None` to leave those labels blank.
    :param tag_col: Name of the column in `token_features` from which the
     'tag' label for each node of the parse tree should be extracted; or `None`
     to leave those labels blank.
    :param label_col: Name of the column in `token_features` from which the
     label for each edge of the parse tree should be extracted; or `None`
     to leave those labels blank.
    :param head_col: Name of the column in `token_features` from which the
     head node of each parse tree node should be extracted.
    """
    # Imported locally so that this module does not depend on spaCy at
    # import time.
    import spacy.displacy

    parse_tree = token_features_to_tree(
        token_features, text_col, tag_col, label_col, head_col
    )
    return spacy.displacy.render(parse_tree, manual=True)