#
# Copyright (c) 2020 IBM Corp.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
# conll.py
"""
The ``io.conll`` module contains I/O functions related to CoNLL-2003 file format and
its derivatives, including CoNLL-U.
"""
from typing import *
import numpy as np
import pandas as pd
import regex
import requests
import os
from zipfile import ZipFile
from text_extensions_for_pandas.array.span import SpanArray
from text_extensions_for_pandas.array.token_span import (
TokenSpan,
TokenSpanArray,
)
# Special token that CoNLL-2003 format uses to delineate the documents in
# the collection.
_CONLL_DOC_SEPARATOR = "-DOCSTART-"
_EWT_DOC_SEPERATOR = "# newdoc id"
# _PUNCT_REGEX = regex.compile(f"[{string.punctuation}]+")
_PUNCT_OR_RIGHT_PAREN_REGEX = regex.compile(
# Punctuation, right paren, or apostrophe followed by 1-2 lowercase letters
# But not single or double quote, which could either begin or end a quotation
"[!#%)*+,-./:;=>?@\\]^_`|}~]|'[a-zA-Z]{1,2}"
)
# Tokens that behave like left parentheses for whitespace purposes,
# including dollar signs ("$100", not "$ 100")
_LEFT_PAREN_REGEX = regex.compile(r"[(<\[{$]+")
# _PUNCT_MATCH_FN = np.vectorize(lambda s: _PUNCT_REGEX.fullmatch(s) is not None)
_SPACE_BEFORE_MATCH_FN = np.vectorize(
lambda s: _PUNCT_OR_RIGHT_PAREN_REGEX.fullmatch(s) is not None
)
_SPACE_AFTER_MATCH_FN = np.vectorize(
lambda s: _LEFT_PAREN_REGEX.fullmatch(s) is not None
)
def default_conll_u_field_names() -> List[str]:
    """
    :returns: The default set of field names (not including the required first
     two fields) to use when parsing CoNLL-U files.
    """
    # Fields 3..10 of the 10-field CoNLL-U line format; see
    # https://universaldependencies.org/format.html
    return [
        "lemma",
        "upostag",
        "xpostag",
        "features",
        "head",
        "deprel",
        "deps",
        "misc",
    ]
def default_conll_u_numeric_cols() -> List[str]:
    """
    :returns: The default set of column names that hold numeric values when
     parsing CoNLL-U files (used for downstream type conversion).
    """
    return [
        "head",
        "line_num",
    ]
# Note: in CoNLL-U the token index within a sentence is explicit and starts
# at 1 (one past the usual zero-based convention).
# For more information see https://universaldependencies.org/docs/format.html
def _make_empty_meta_values(
column_names: List[str], iob_columns: List[bool]
) -> Dict[str, List[Optional[Union[str, int]]]]:
ret = {}
for i in range(len(column_names)):
name = column_names[i]
if i >= len(iob_columns) or not iob_columns[i]:
ret[name] = []
else:
ret[f"{name}_iob"] = []
ret[f"{name}_type"] = []
return ret
class _SentenceData:
"""
Data structure that encapsulates one sentence's worth of data
from a parsed CoNLL-2003 file.
Not intended for use outside this file.
"""
def __init__(
self,
column_names: List[str],
iob_columns: List[bool],
predicate_args: bool,
conllu_metadata_cols: List[str] = None,
):
self._column_names = column_names
self._iob_columns = iob_columns
self._num_standard_cols = len(self._column_names)
# metadata-- init to None
self._token_metadata = None
# Surface form of token
self._tokens = [] # Type: List[str]
# Line numbers for each token from the file
self._line_nums = [] # Type: List[int]
# metadata from conll_u file
self._conllu_metadata = (
dict.fromkeys(conllu_metadata_cols, "")
if conllu_metadata_cols is not None
else None
)
self._conllu_metadata_exists = False
self._conll_09_format = predicate_args
@property
def num_tokens(self) -> int:
return len(self._tokens)
@property
def tokens(self) -> List[str]:
return self._tokens
@property
def token_metadata(self) -> Dict[str, List[str]]:
return self._token_metadata
@property
def line_nums(self):
return self._line_nums
@property
def column_names(self):
return self._column_names
@property
def conll_u_metadata_feilds(self) -> List[str]:
return (
list(self._conllu_metadata.keys())
if self._conllu_metadata is not None
else None
)
@property
def has_conll_u_metadata(self):
return self._conllu_metadata_exists
@property
def conll_09_format(self):
return self._conll_09_format
def set_conll_u_metadata(self, field: str, val: str):
if str != "":
self._conllu_metadata_exists = True
self._conllu_metadata[field] = val
self._update_conllu_metadata_exists()
def set_batch_conll_u_metadata(self, metadata: Dict[str, str]):
assert metadata.keys() <= self._conllu_metadata.keys()
self._conllu_metadata.update(metadata)
self._update_conllu_metadata_exists()
def get_conll_u_metadata(self, field: str) -> str:
return self._conllu_metadata[field]
def _update_conllu_metadata_exists(self):
self._conllu_metadata_exists = any(
[v is not None and v != "" for v in self._conllu_metadata.values()]
)
def _process_line_tags(
self,
raw_tags: List[str],
line_num: int,
line_elems: List[str],
is_conll_u: bool = False,
):
if self._token_metadata is None:
self._token_metadata = _make_empty_meta_values(
self._column_names, self._iob_columns
)
for i in range(len(raw_tags)):
raw_tag = raw_tags[i]
name = self._column_names[i]
if not self._iob_columns[i]:
# non-IOB data
self._token_metadata[name].append(raw_tag)
else:
# IOB-format data; split into two values
if raw_tag.startswith("I-") or raw_tag.startswith("B-"):
# Tokens that are entities are tagged with tags like
# "I-PER" or "B-MISC".
tag, entity = raw_tag.split("-")
elif raw_tag == "O":
tag = raw_tag
entity = None
elif (not is_conll_u) and raw_tag == "-X-":
# Special metadata value for -DOCSTART- tags in the CoNLL corpus.
tag = "O"
entity = None
else:
raise ValueError(
f"Tag '{raw_tag}' of IOB-format field {i} at line "
f"{line_num} does not start with 'I-', 'O', "
f"or 'B-'.\n"
f"Fields of line are: {line_elems}"
)
self._token_metadata[f"{name}_iob"].append(tag)
self._token_metadata[f"{name}_type"].append(entity)
def add_line(self, line_num: int, line_elems: List[str]):
"""
:param line_num: Location in file, for error reporting
:param line_elems: Fields of a line, pre-split
"""
if len(line_elems) != 1 + len(self._column_names):
raise ValueError(
f"Unexpected number of elements {len(line_elems)} "
f"at line {line_num}; expected "
f"{1 + len(self._column_names)} elements."
)
token = line_elems[0]
raw_tags = line_elems[1:]
self._tokens.append(token)
self._line_nums.append(line_num)
self._process_line_tags(raw_tags, line_num, line_elems, is_conll_u=False)
def add_line_conllu(self, line_num: int, line_elems: List[str]):
"""
Similar to add_line, but handles additional logic for conllu files.
This includes the additional ignored entries on the left for word indexes within
:param line_num: Location in file, for error reporting
:param line_elems: Fields of a line, pre-split
"""
if len(line_elems) < 2 + len(self._column_names):
if len(line_elems) >= 2 + self._num_standard_cols:
line_elems.extend(
["_" for _ in range(2 + len(self._column_names) - len(line_elems))]
)
else:
raise ValueError(
f"Unexpected number of elements {len(line_elems)} "
f"at line {line_num}; expected "
f"{2 + len(self._column_names)} elements, "
f"got {len(line_elems)} instead."
f" min_num: {self._num_standard_cols}"
f"\nline reads: '{' '.join(line_elems) }'"
)
if (
len(line_elems) > 2 + len(self._column_names)
and self._conll_09_format
and self.num_tokens == 0
):
# only modify once per sentence
additional_lines = len(line_elems) - (3 + len(self._column_names))
self._column_names.append("predicate")
addnl_col_names = [f"pred{i}arg" for i in range(additional_lines)]
self._column_names.extend(addnl_col_names)
self._iob_columns.extend([False for _ in range(additional_lines + 1)])
# print(f"found Conll9 format. Added{additional_lines} columns. cols are now {self._column_names}")
assert len(self._column_names) + 2 == len(line_elems)
token = line_elems[1]
raw_tags = line_elems[2:len(self._column_names) + 2]
raw_tags = [None if tag == "_" else tag for tag in raw_tags]
self._tokens.append(token)
self._line_nums.append(line_num)
# because we do not combine
self._process_line_tags(raw_tags, line_num, line_elems, is_conll_u=True)
def _parse_conll_file(
    input_file: str, column_names: List[str], iob_columns: List[bool]
) -> List[List[_SentenceData]]:
    """
    Parse the CoNLL-2003 file format for training/test data to Python objects.

    The format is especially tricky, so everything here is straight non-vectorized
    Python code. If you want performance, write the contents of your CoNLL files back
    out into a file format that supports performance.

    :param input_file: Location of the file to read
    :param column_names: Names for the metadata columns that come after the
     token text. These names will be used to generate the names of the dataframe
     that this function returns.
    :param iob_columns: Mask indicating which of the metadata columns after the
     token text should be treated as being in IOB format. If a column is in IOB format,
     the returned data structure will contain *two* columns, holding IOB tags and
     entity type tags, respectively. For example, an input column "ent" will turn into
     output columns "ent_iob" and "ent_type".

    :returns: A list of lists of _SentenceData objects. The top list has one entry per
     document. The next level lists have one entry per sentence.
    """
    with open(input_file, "r") as f:
        lines = f.readlines()

    # Build up a list of document metadata as Python objects
    docs = []  # Type: List[List[Dict[str, List[str]]]]
    current_sentence = _SentenceData(column_names, iob_columns, False)

    # Sentences of the current document accumulated so far
    sentences = []  # Type: SentenceData

    for i in range(len(lines)):
        line = lines[i].strip()
        if 0 == len(line):
            # Blank line is the sentence separator
            if current_sentence.num_tokens > 0:
                sentences.append(current_sentence)
                current_sentence = _SentenceData(column_names, iob_columns, False)
        else:
            # Not at the end of a sentence; fields are space-separated
            line_elems = line.split(" ")
            current_sentence.add_line(i, line_elems)

            if line_elems[0] == _CONLL_DOC_SEPARATOR and i > 0:
                # End of document. Wrap up this document and start a new one.
                #
                # Note that the special "start of document" token is considered part
                # of the document. If you do not follow this convention, the
                # result sets from CoNLL 2003 won't line up.
                # Note also that `current_sentence` is not in `sentences` and will be
                # added to the next document.
                docs.append(sentences)
                sentences = []

    # Close out the last sentence and document, if needed
    if current_sentence.num_tokens > 0:
        sentences.append(current_sentence)
    if len(sentences) > 0:
        docs.append(sentences)
    return docs
def _parse_conll_u_file(
    input_file: str,
    column_names: List[str],
    iob_columns: List[bool],
    predicate_args: bool = True,
    merge_subtokens: bool = False,
    merge_subtoken_separator: str = "|",
    metadata_fields: Dict[str, str] = None,
    doc_seperator=_EWT_DOC_SEPERATOR
) -> List[List[_SentenceData]]:
    """
    Parse a CoNLL-U file into Python objects.

    The format is especially tricky, so everything here is straight non-vectorized Python
    code. If you want performance, write the contents of your CoNLL files back out into a
    file format that supports performance.

    :param input_file: Location of the file to read
    :param column_names: Names for the metadata columns that come after the
     token text. These names will be used to generate the names of the dataframe
     that this function returns.
    :param iob_columns: Mask indicating which of the metadata columns after the
     token text should be treated as being in IOB format. If a column is in IOB format,
     the returned data structure will contain *two* columns, holding IOB tags and
     entity type tags, respectively. For example, an input column "ent" will turn into
     output columns "ent_iob" and "ent_type".
    :param predicate_args: whether or not predicate arguments are stored in this file
     format (CoNLL-2009 style trailing columns).
    :param merge_subtokens: if ``True``, merge the sub-token rows of a multi-word
     token range (e.g. "3-4") into a single token.
    :param merge_subtoken_separator: separator placed between merged sub-token
     field values.
    :param metadata_fields: Optional. The types of metadata fields you want to store
     from the document. Stored in the form of dictionary: tag_in_text -> "pretty" tag
     (i.e. what you want to show in the output).
     If no value is provided, then the return value of :func:`default_ewt_metadata()`
     will be used.
    :param doc_seperator: comment prefix that marks the start of a new document.

    :returns: A list of lists of _SentenceData objects. The top list has one entry per
     document. The next level lists have one entry per sentence.
    """
    if metadata_fields is None:
        metadata_fields = default_ewt_metadata()
    with open(input_file, "r") as f:
        lines = f.readlines()

    # Build up a list of document metadata as Python objects
    docs = []  # Type: List[List[Dict[str, List[str]]]]

    # metadata specific to conll_u; tracks the most recent value of each field
    metadata_names = list(metadata_fields.values())
    u_metadata = dict.fromkeys(metadata_names, "")

    current_sentence = _SentenceData(
        column_names.copy(), iob_columns.copy(), predicate_args, metadata_names
    )

    # Sentences of the current document accumulated so far
    sentences = []  # Type: SentenceData

    # if we merge subtokens we need additional logic
    in_subtok = False  # set this flag when inside of subtoken
    subtok_end = None  # only valid when in subtok

    for i in range(len(lines)):
        line = lines[i].strip()
        if 0 == len(line):
            # Blank line is the sentence separator
            if current_sentence.num_tokens > 0:
                sentences.append(current_sentence)
                current_sentence = _SentenceData(
                    column_names.copy(),
                    iob_columns.copy(),
                    predicate_args,
                    metadata_names,
                )
                # Carry forward document-level metadata into the new sentence
                current_sentence.set_batch_conll_u_metadata(u_metadata)
        elif line[0] == "#":
            # Comment line; may hold "# key = value" metadata
            line_elems = line.split(" = ")
            if line_elems[0] == doc_seperator:
                if i > 0 and len(sentences) > 0:
                    # End of document. Wrap up this document and start a new one.
                    #
                    docs.append(sentences)
                    sentences = []
                # reset doc, paragraph and sentence id's
            # now check for metadata
            line_elems[0] = line_elems[0].strip("# ")
            if line_elems[0] in metadata_fields.keys():
                key = metadata_fields[line_elems[0]]
                current_sentence.set_conll_u_metadata(key, line_elems[1])
                u_metadata[key] = line_elems[1]
        elif not in_subtok:
            # Not at the end of a sentence, or in a subtok
            line_elems = line.split("\t")
            # Ignore multi-word tokens for now; just use word sequence; may want to change, but we'd need to
            # interpret each sub-word's info
            if "-" not in line_elems[0]:  # checks if has range
                current_sentence.add_line_conllu(i, line_elems)
            elif merge_subtokens:
                in_subtok = True
                # find start and end of range
                start, end = line_elems[0].split("-")
                subtok_end = (
                    int(end) - int(start) + i + 1
                )  # the end (inclusive) of subtoken, by global index
                # Collect the non-"_" values of each field across the range
                comb_elem_list = [[] for i in range(len(line_elems))]
                for subtoken in lines[i + 1:subtok_end + 1]:
                    subtok_elems = subtoken.split("\t")
                    for field in range(2, len(line_elems)):
                        if subtok_elems[field] != "_":
                            comb_elem_list[field].append(subtok_elems[field])
                combined_elems = line_elems[0:2]  # first line is the same
                for elem_list in comb_elem_list[2:]:
                    combined_elems.append(merge_subtoken_separator.join(elem_list))
                current_sentence.add_line_conllu(i, combined_elems)
        # Skip the individual sub-token lines of a merged range
        if in_subtok and i >= subtok_end:
            in_subtok = False
            subtok_end = None

    # Close out the last sentence and document, if needed
    if current_sentence.num_tokens > 0:
        sentences.append(current_sentence)
    if len(sentences) > 0:
        docs.append(sentences)
    return docs
def _parse_conll_output_file(
    doc_dfs: List[pd.DataFrame], input_file: str
) -> List[Dict[str, List[str]]]:
    """
    Parse the CoNLL-2003 file format for output data to Python
    objects. This format is similar to the format that `_parse_conll_file`
    produces, but without the token and document boundary information.

    :param doc_dfs: List of `pd.DataFrame`s of token information from the
     corresponding training data file, one `DataFrame` per document.
     Used for determining document boundaries, which are not encoded in
     CoNLL-2003 output file format.
    :param input_file: Location of the file to read

    :returns: A list of dicts. The top list has one entry per
     document. The next level contains lists under the following keys:
     * `iob`: List of IOB2 tags as strings. This function does **NOT**
       correct for the silly way that CoNLL-format uses "B" tags. See
       `_fix_iob_tags()` for that correction.
     * `entity`: List of entity tags where `iob` contains I's or B's.
       `None` everywhere else.
    """
    with open(input_file, "r") as f:
        lines = f.readlines()

    # Build up a list of document metadata as Python objects
    docs = []  # Type: List[Dict[str, List[str]]]

    # Position in the corpus; token counts from doc_dfs drive doc boundaries
    doc_num = 0
    num_tokens_in_doc = len(doc_dfs[doc_num].index)
    token_num = 0

    # Information about the current document's tokens
    iobs = []  # Type: List[str]
    entities = []  # Type: List[str]

    for i in range(len(lines)):
        line = lines[i].strip()
        if 0 == len(line):
            # Blank line is the sentence separator.
            continue
        if " " in line:
            # Output files hold exactly one tag per line; a space means the
            # file is in the wrong format.
            raise ValueError(
                f"Line {i} contains unexpected space character.\n" f"Line was: '{line}'"
            )
        raw_tag = line
        if raw_tag.startswith("I") or raw_tag.startswith("B"):
            # Tokens that are entities are tagged with tags like
            # "I-PER" or "B-MISC".
            tag, entity = raw_tag.split("-")
        elif raw_tag == "O":
            tag = raw_tag
            entity = None
        else:
            raise ValueError(
                f"Unexpected tag {raw_tag} at line {i}.\n" f"Line was: '{line}'"
            )
        iobs.append(tag)
        entities.append(entity)

        token_num += 1
        if token_num == num_tokens_in_doc:
            # End of current document, advance to next
            docs.append({"iob": iobs, "entity": entities})
            iobs = []
            entities = []
            doc_num += 1
            token_num = 0
            if doc_num < len(doc_dfs):
                num_tokens_in_doc = len(doc_dfs[doc_num].index)

    if doc_num < len(doc_dfs):
        # Fewer outputs than documents; warn rather than fail.
        print(
            f"WARNING: Corpus has {len(doc_dfs)} documents, but "
            f"only found outputs for {doc_num} of them."
        )
        # raise ValueError(f"Corpus has {len(doc_dfs)} documents, but "
        #                  f"only found outputs for {doc_num} of them.")

    return docs
def _iob_to_iob2(
    df: pd.DataFrame, column_names: List[str], iob_columns: List[bool]
) -> pd.DataFrame:
    """
    In CoNLL-2003 format, entities are stored in IOB format, where the first
    token of an entity is only tagged "B" when there are two entities of the
    same type back-to-back. This format makes downstream processing difficult.
    If a given position has an `I` tag, that position may or may not be the
    first token of an entity. Code will need to inspect both the I/O/B tags
    *and* the entity type of multiple other tokens *and* the boundaries between
    sentences to disambiguate between those two cases.

    This function converts these IOB tags to the easier-to-consume IOB2 format;
    see
    https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
    for details. Basically, every entity in IOB2 format begins with a `B` tag.
    The `I` tag is only used for the second, third, etc. tokens of an entity.

    :param df: A `pd.DataFrame` with one row per token of the document.
     In addition to the metadata columns corresponding to `column_names`, this
     dataframe must also contain sentence information in a column called `sentence`.
    :param column_names: Names for the metadata columns in the original data file
     that were used to generate the names of the columns of `df`.
    :param iob_columns: Mask indicating which of the metadata columns after the
     token text should be treated as being in IOB format.

    :returns: A version of `df` with corrected IOB2 tags in the `ent_iob`
     column. The original dataframe is not modified.
    """
    ret = df.copy()
    if len(df.index) == 0:
        # Empty document: nothing to correct. (Without this guard, the
        # `iobs[0]` access below would raise IndexError.)
        return ret
    # NOTE(review): assumes `sentence` column is backed by a span array
    # exposing `begin_token` (as produced by _doc_to_df).
    sentence_begins = df["sentence"].values.begin_token

    for col_num in range(len(iob_columns)):
        if iob_columns[col_num]:
            name = column_names[col_num]
            iobs = df[f"{name}_iob"].values.copy()  # Modified in place
            entities = df[f"{name}_type"].values
            # Special-case the first one
            if iobs[0] == "I":
                iobs[0] = "B"
            for iob_num in range(1, len(iobs)):
                tag = iobs[iob_num]
                prev_tag = iobs[iob_num - 1]
                if tag == "I":
                    if (
                        prev_tag == "O"  # Previous token not an entity
                        or (
                            prev_tag in ("I", "B")
                            and entities[iob_num] != entities[iob_num - 1]
                        )  # Previous token a different type of entity
                        or (
                            sentence_begins[iob_num] != sentence_begins[iob_num - 1]
                        )  # Start of new sentence
                    ):
                        iobs[iob_num] = "B"
            ret[f"{name}_iob"] = iobs
    return ret
def _doc_to_df(
    doc: List[_SentenceData],
    column_names: List[str],
    iob_columns: List[bool],
    space_before_punct: bool,
    conll_u: bool = False,
) -> pd.DataFrame:
    """
    Convert the "Python objects" representation of a document from a
    CoNLL-2003 file into a `pd.DataFrame` of token metadata.

    :param doc: List of Python objects that represents the document.
    :param column_names: Names for the metadata columns that come after the
     token text. These names will be used to generate the names of the dataframe
     that this function returns.
    :param iob_columns: Mask indicating which of the metadata columns after the
     token text should be treated as being in IOB format. If a column is in IOB format,
     the returned dataframe will contain *two* columns, holding IOB2 tags and
     entity type tags, respectively. For example, an input column "ent" will turn into
     output columns "ent_iob" and "ent_type".
    :param space_before_punct: If `True`, add whitespace before
     punctuation characters (and after left parentheses)
     when reconstructing the text of the document.
    :param conll_u: If `True`, the document came from a CoNLL-U file; enables
     re-basing of the "head" column's sentence-relative indices.

    :return: DataFrame with four columns:
    * `span`: Span of each token, with character offsets.
      Backed by the concatenation of the tokens in the document into
      a single string with one sentence per line.
    * `ent_iob`: IOB2-format tags of tokens, exactly as they appeared
      in the original file, with no corrections applied.
    * `ent_type`: Entity type names for tokens tagged "I" or "B" in
      the `ent_iob` column; `None` everywhere else.
    * `line_num`: line number of each token in the parsed file
    """
    # Character offsets of tokens in the reconstructed document
    begins_list = []  # Type: List[np.ndarray]
    ends_list = []  # Type: List[np.ndarray]

    # Reconstructed text of each sentence
    sentences_list = []  # Type: List[np.ndarray]

    # Token offsets of sentences containing each token in the document.
    sentence_begins_list = []  # Type: List[np.ndarray]
    sentence_ends_list = []  # Type: List[np.ndarray]

    # conll_u metadata information.
    conll_u_ids_exsist = doc is not None and len(doc) != 0 and doc[0].has_conll_u_metadata
    conll_2009_format = doc is not None and len(doc) != 0 and doc[0].conll_09_format
    # this should be the same for all sentences so we check the first
    if conll_2009_format:
        # CoNLL-2009 sentences may have grown extra predicate-argument
        # columns; adopt the widest column list seen in the document.
        max_list = max(doc, key=lambda sent: len(sent.column_names)).column_names
        if len(max_list) > len(column_names):
            column_names = max_list

    # Token metadata column values. Key is column name, value is metadata for
    # each token.
    if conll_u_ids_exsist:
        meta_lists = _make_empty_meta_values(
            column_names + doc[0].conll_u_metadata_feilds, iob_columns
        )
    else:
        meta_lists = _make_empty_meta_values(column_names, iob_columns)

    # Line numbers of the parsed file for each token in the doc
    doc_line_nums = []

    char_position = 0
    token_position = 0
    for sentence_num in range(len(doc)):
        sentence = doc[sentence_num]
        tokens = sentence.tokens

        # Don't put spaces before punctuation in the reconstituted string.
        no_space_before_mask = (
            np.zeros(len(tokens), dtype=bool)
            if space_before_punct
            else _SPACE_BEFORE_MATCH_FN(tokens)
        )
        no_space_after_mask = (
            np.zeros(len(tokens), dtype=bool)
            if space_before_punct
            else _SPACE_AFTER_MATCH_FN(tokens)
        )
        no_space_before_mask[0] = True  # No space before first token
        no_space_after_mask[-1] = True  # No space after last token
        shifted_no_space_after_mask = np.roll(no_space_after_mask, 1)
        # Space (or not) that goes *before* each token
        prefixes = np.where(
            np.logical_or(no_space_before_mask, shifted_no_space_after_mask), "", " "
        )
        # Interleave prefixes and tokens: p0, t0, p1, t1, ...
        string_parts = np.ravel((prefixes, tokens), order="F")
        sentence_text = "".join(string_parts)
        sentences_list.append(sentence_text)

        lengths = np.array([len(t) for t in tokens])
        prefix_lengths = np.array([len(p) for p in prefixes])

        # Begin and end offsets, accounting for which tokens have spaces
        # before them.
        e = np.cumsum(lengths + prefix_lengths)
        b = e - lengths
        begins_list.append(b + char_position)
        ends_list.append(e + char_position)

        sentence_begin_token = token_position
        sentence_end_token = token_position + len(e)
        sentence_begins = np.repeat(sentence_begin_token, len(e))
        sentence_ends = np.repeat(sentence_end_token, len(e))
        sentence_begins_list.append(sentence_begins)
        sentence_ends_list.append(sentence_ends)

        # Extend each metadata column by one value per token; sentence-level
        # CoNLL-U metadata is replicated across the sentence's tokens.
        for k in meta_lists.keys():
            if k in sentence.token_metadata.keys():
                meta_lists[k].extend(sentence.token_metadata[k])
            elif conll_u_ids_exsist and k in sentence.conll_u_metadata_feilds:
                data = sentence.get_conll_u_metadata(k)
                meta_lists[k].extend([data for _ in range(sentence.num_tokens)])
            else:
                meta_lists[k].extend([None for _ in range(sentence.num_tokens)])

        char_position += e[-1] + 1  # "+ 1" to account for newline
        token_position += len(e)

        doc_line_nums.extend(sentence.line_nums)

        # move "head" indices so they point at the right words
        if conll_u and "head" in column_names:
            for i in range(sentence_begin_token, sentence_end_token):
                val = meta_lists["head"][i]
                if val is not None:
                    # CoNLL-U heads are 1-based within the sentence; 0 means
                    # root. Convert to 0-based document-wide token index,
                    # using -1 as a placeholder for root.
                    points_to = int(val)
                    meta_lists["head"][i] = (
                        points_to + sentence_begin_token - 1 if points_to != 0 else -1
                    )

    begins = np.concatenate(begins_list)
    ends = np.concatenate(ends_list)
    doc_text = "\n".join(sentences_list)
    char_spans = SpanArray(doc_text, begins, ends)
    sentence_spans = TokenSpanArray(
        char_spans,
        np.concatenate(sentence_begins_list),
        np.concatenate(sentence_ends_list),
    )

    ret = pd.DataFrame({"span": char_spans})
    for k, v in meta_lists.items():
        ret[k] = v
    ret["sentence"] = sentence_spans
    ret["line_num"] = pd.Series(doc_line_nums)
    if conll_u and "head" in column_names:
        # Use nullable Int64 so the root placeholder (-1) can become pd.NA
        ret = ret.astype({"head": "Int64"}, errors="ignore")
        ret.loc[ret["head"] == -1, "head"] = pd.NA
    return ret
def _output_doc_to_df(
tokens: pd.DataFrame,
outputs: Dict[str, List[str]],
column_name: str,
copy_tokens: bool,
) -> pd.DataFrame:
"""
Convert the "Python objects" representation of a document from a
CoNLL-2003 file into a `pd.DataFrame` of token metadata.
:param tokens: `pd.DataFrame` containing metadata about the tokens
of this document, as returned by `conll_2003_to_dataframe`
:param outputs: Dictionary containing outputs for this document,
with fields "iob" and "entity".
:param column_name: Name for the metadata value that the IOB-tagged data
in `input_file` encodes. If this name is present in `doc_dfs`, its value
will be replaced with the data from `input_file`; otherwise a new column
will be added to each dataframe.
:param copy_tokens: `True` if token information should be deep-copied.
:return: DataFrame with four columns:
* `span`: Span of each token, with character offsets.
Backed by the concatenation of the tokens in the document into
a single string with one sentence per line.
* `ent_iob`: IOB2-format tags of tokens, corrected so that every
entity begins with a "B" tag.
* `ent_type`: Entity type names for tokens tagged "I" or "B" in
the `ent_iob` column; `None` everywhere else.
"""
if copy_tokens:
return pd.DataFrame(
{
"span": tokens["span"].copy(),
f"{column_name}_iob": np.array(outputs["iob"]),
f"{column_name}_type": np.array(outputs["entity"]),
"sentence": tokens["sentence"].copy(),
}
)
else:
return pd.DataFrame(
{
"span": tokens["span"],
f"{column_name}_iob": np.array(outputs["iob"]),
f"{column_name}_type": np.array(outputs["entity"]),
"sentence": tokens["sentence"],
}
)
#####################################################
# External API functions below this line
def iob_to_spans(
    token_features: pd.DataFrame,
    iob_col_name: str = "ent_iob",
    span_col_name: str = "span",
    entity_type_col_name: str = "ent_type",
):
    """
    Convert token tags in Inside–Outside–Beginning (IOB2) format to a series of
    :class:`TokenSpan` objects of entities. See wikipedia_ for more information
    on the IOB2 format.

    .. _wikipedia: https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)

    :param token_features: DataFrame of token features in the format returned by
     :func:`make_tokens_and_features`.
    :param iob_col_name: Name of a column in ``token_features`` that contains the
     IOB2 tags as strings, "I", "O", or "B".
    :param span_col_name: Name of a column in ``token_features`` that
     contains the tokens as a :class:`SpanArray`.
    :param entity_type_col_name: Optional name of a column in ``token_features``
     that contains entity type information; or ``None`` if no such column exists.

    :returns: A :class:`pd.DataFrame` with the following columns:
    * ``span``: Span (with token offsets) of each entity
    * ``<value of entity_type_col_name>``: (optional) Entity type
    """
    # Start out with 1-token prefixes of all entities.
    begin_mask = token_features[iob_col_name] == "B"
    first_tokens = token_features[begin_mask].index
    if entity_type_col_name is None:
        entity_types = np.zeros(len(first_tokens))
    else:
        entity_types = token_features[begin_mask][entity_type_col_name]

    # Add an extra "O" tag to the end of the IOB column to simplify the logic
    # for handling the case where the document ends with an entity.
    # Fix: ``pd.Series.append`` was removed in pandas 2.0; use ``pd.concat``.
    iob_series = pd.concat(
        [token_features[iob_col_name], pd.Series(["O"])]
    ).reset_index(drop=True)

    entity_prefixes = pd.DataFrame(
        {
            "ent_type": entity_types,
            "begin": first_tokens,  # Inclusive
            "end": first_tokens + 1,  # Exclusive
            "next_tag": iob_series.iloc[first_tokens + 1].values,
        }
    )

    df_list = []  # Type: pd.DataFrame

    if len(entity_prefixes.index) == 0:
        # Code below needs at least one element in the list for schema
        df_list = [entity_prefixes]

    # Iteratively expand the prefixes
    while len(entity_prefixes.index) > 0:
        complete_mask = entity_prefixes["next_tag"].isin(["O", "B"])
        complete_entities = entity_prefixes[complete_mask]
        incomplete_entities = entity_prefixes[~complete_mask].copy()
        incomplete_entities["end"] = incomplete_entities["end"] + 1
        incomplete_entities["next_tag"] = iob_series.iloc[
            incomplete_entities["end"]
        ].values
        df_list.append(complete_entities)
        entity_prefixes = incomplete_entities
    all_entities = pd.concat(df_list)

    # Sort spans by location, not length.
    all_entities.sort_values("begin", inplace=True)

    # Convert [begin, end) pairs to spans
    entity_spans_array = TokenSpanArray(
        token_features[span_col_name].values,
        all_entities["begin"].values,
        all_entities["end"].values,
    )
    if entity_type_col_name is None:
        return pd.DataFrame({"span": entity_spans_array})
    else:
        return pd.DataFrame(
            {
                "span": entity_spans_array,
                entity_type_col_name: all_entities["ent_type"].values,
            }
        )
def spans_to_iob(
    token_spans: Union[TokenSpanArray, List[TokenSpan], pd.Series],
    span_ent_types: Union[str, Iterable, np.ndarray, pd.Series] = None,
) -> pd.DataFrame:
    """
    Convert a series of :class:`TokenSpan` objects of entities to token tags in
    Inside–Outside–Beginning (IOB2) format. See wikipedia_ for more information
    on the IOB2 format.

    .. _wikipedia: https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)

    :param token_spans: An object that can be converted to a :class:`TokenSpanArray` via
     :func:`TokenSpanArray.make_array`. Should contain :class:`TokenSpan` objects
     aligned with the target tokenization. All spans must be from the same document.
     Usually you create this array by calling :func:`TokenSpanArray.align_to_tokens`.
    :param span_ent_types: List of entity type strings corresponding to each of the
     elements of ``token_spans``, or ``None`` to indicate null entity tags.

    :returns: A :class:`pd.DataFrame` with two columns:
    * "ent_iob": IOB2 tags as strings "ent_iob"
    * "ent_type": Entity type strings (or NaN values if ``ent_types`` is ``None``)
    """
    # Normalize inputs
    token_spans = TokenSpanArray.make_array(token_spans)
    if span_ent_types is None:
        span_ent_types = [None] * len(token_spans)
    elif isinstance(span_ent_types, str):
        span_ent_types = [span_ent_types] * len(token_spans)
    elif isinstance(span_ent_types, pd.Series):
        span_ent_types = span_ent_types.values

    # Define the IOB categorical type with "O" == 0, "B" == 1, "I" == 2
    iob2_dtype = pd.CategoricalDtype(["O", "B", "I"], ordered=False)

    # Handle an empty token span array
    if len(token_spans) == 0:
        return pd.DataFrame(
            {
                "ent_iob": pd.Series(dtype=iob2_dtype),
                "ent_type": pd.Series(dtype="string"),
            }
        )

    # All code that follows assumes at least one input span. All spans should
    # be from the same document; otherwise there isn't a meaningful IOB
    # representation of the entities.
    if not token_spans.is_single_tokenization:
        raise ValueError(
            f"All input spans must be from the same tokenization of "
            f"the same document "
            f"(spans are {token_spans})"
        )
    tokens = token_spans.tokens[0]

    # Initialize an IOB series with all 'O' entities
    iob_data = np.zeros_like(tokens.begin, dtype=np.int64)
    iob_tags = pd.Categorical.from_codes(codes=iob_data, dtype=iob2_dtype)

    # Assign the begin tags
    iob_tags[token_spans.begin_token] = "B"

    # Fill in the remaining inside tags
    i_lengths = token_spans.end_token - (token_spans.begin_token + 1)
    i_mask = i_lengths > 0
    i_begins = token_spans.begin_token[i_mask] + 1
    i_ends = token_spans.end_token[i_mask]
    for begin, end in zip(i_begins, i_ends):
        iob_tags[begin:end] = "I"

    # Use a similar process to generate entity type tags
    # (plain None fill; the previous np.object_(None) was equivalent)
    ent_types = np.full(len(tokens), None, dtype=object)
    for ent_type, begin, end in zip(
        span_ent_types, token_spans.begin_token, token_spans.end_token
    ):
        ent_types[begin:end] = ent_type

    return pd.DataFrame(
        {"ent_iob": iob_tags, "ent_type": pd.Series(ent_types, dtype="string")}
    )
def conll_2003_to_dataframes(
    input_file: str,
    column_names: List[str],
    iob_columns: List[bool],
    space_before_punct: bool = False,
) -> List[pd.DataFrame]:
    """
    Parse a file in CoNLL-2003 training/test format into one DataFrame per
    document.

    CoNLL-2003 training/test format looks like this::

        -DOCSTART- -X- -X- O

        CRICKET NNP I-NP O
        - : O O
        LEICESTERSHIRE NNP I-NP I-ORG
        TAKE NNP I-NP O
        OVER IN I-PP O
        AT NNP I-NP O

    Note the presence of the surface forms of tokens at the beginning
    of the lines.

    :param input_file: Location of input file to read.
    :param column_names: Names for the metadata columns that come after the
        token text. These names will be used to generate the names of the dataframe
        that this function returns.
    :param iob_columns: Mask indicating which of the metadata columns after the
        token text should be treated as being in IOB format. If a column is in IOB
        format, the returned dataframe will contain *two* columns, holding **IOB2**
        tags and entity type tags, respectively. For example, an input column "ent"
        will turn into output columns "ent_iob" and "ent_type".
    :param space_before_punct: If ``True``, add whitespace before
        punctuation characters when reconstructing the text of the document.
    :returns: A list containing, for each document in the input file,
        a separate :class:`pd.DataFrame` of four columns:

        * **span**: Span of each token, with character offsets.
          Backed by the concatenation of the tokens in the document into
          a single string with one sentence per line.
        * **ent_iob**: IOB2-format tags of tokens, corrected so that every
          entity begins with a "B" tag.
        * **ent_type**: Entity type names for tokens tagged "I" or "B" in
          the `ent_iob` column; `None` everywhere else.
    """
    # Parse the raw file, then convert and IOB2-normalize each document in
    # a single pass over the parsed documents.
    parsed = _parse_conll_file(input_file, column_names, iob_columns)
    return [
        _iob_to_iob2(
            _doc_to_df(doc, column_names, iob_columns, space_before_punct),
            column_names,
            iob_columns,
        )
        for doc in parsed
    ]
def conll_u_to_dataframes(
    input_file: str,
    column_names: Optional[List[str]] = None,
    iob_columns: Optional[List[bool]] = None,
    has_predicate_args: bool = True,
    space_before_punct: bool = False,
    merge_subtokens: bool = False,
    merge_subtoken_separator: str = "|",
    numeric_cols: Optional[List[str]] = None,
    metadata_fields: Optional[Dict[str, str]] = None,
    separate_sentences_by_doc: bool = False
) -> List[pd.DataFrame]:
    """
    Parse a file in CoNLL-U format into one DataFrame per document.

    :param input_file: Location of input file to read.
    :param column_names: Optional. Names for the metadata columns that come after the
        token text. These names will be used to generate the names of the dataframe
        that this function returns.
        If no value is provided, these default to the list returned by
        :func:`default_conll_u_field_names`, which is also the format defined at
        https://universaldependencies.org/docs/format.html.
    :param iob_columns: Mask indicating which of the metadata columns after the
        token text should be treated as being in IOB format. If a column is in IOB
        format, the returned dataframe will contain *two* columns, holding **IOB2**
        tags and entity type tags, respectively. For example, an input column "ent"
        will turn into output columns "ent_iob" and "ent_type". By default in
        CoNLL-U or EWT formats this is all false.
    :param has_predicate_args: Whether or not the file format includes predicate args.
        True by default, and should support most files in the conllu format, but will
        assume that any tabs in the last element are additional predicate arguments.
    :param space_before_punct: If ``True``, add whitespace before
        punctuation characters when reconstructing the text of the document.
    :param merge_subtokens: Dictates how to handle tokens that are smaller than one
        word. By default, we keep the subtokens as separate entities, but if this is
        set to ``True``, the subtokens will be merged into a single entity, of the
        same length as the token, and their attributes will be concatenated.
    :param merge_subtoken_separator: If merge subtokens is selected, concatenate the
        attributes with this separator, by default '|'.
    :param numeric_cols: Optional. Names of numeric columns drawn from
        ``column_names``, plus the default "built-in" column name ``line_num``.
        Any column whose name is in this list will be considered to hold numeric
        values. Column names not present in the ``column_names`` argument will be
        ignored. If no value is provided, then the return value of
        :func:`default_conll_u_numeric_cols` will be used.
    :param metadata_fields: Optional. Types of metadata fields you want to store from
        the document, in the form of a dictionary: tag_in_text -> "pretty" tag (i.e.
        what you want to show in the output).
        If no value is provided, then the return value of
        :func:`default_ewt_metadata` will be used.
    :param separate_sentences_by_doc: Optional. If ``False`` (the default behavior),
        use the document boundaries defined in the CoNLL-U file. If ``True``, then
        treat each sentence in the input file as a separate document.
    :returns: A list containing, for each document in the input file,
        a separate :class:`pd.DataFrame` of four columns:

        * **span**: Span of each token, with character offsets.
          Backed by the concatenation of the tokens in the document into
          a single string with one sentence per line.
        * **ent_iob**: IOB2-format tags of tokens, corrected so that every
          entity begins with a "B" tag.
        * **ent_type**: Entity type names for tokens tagged "I" or "B" in
          the `ent_iob` column; `None` everywhere else.
    """
    # Fill in default values for all optional arguments that were not supplied.
    if column_names is None:
        column_names = default_conll_u_field_names()
    if iob_columns is None:
        # No IOB columns unless the caller says otherwise.
        iob_columns = [False] * len(column_names)
    if metadata_fields is None:
        metadata_fields = default_ewt_metadata()
    if numeric_cols is None:
        numeric_cols = default_conll_u_numeric_cols()

    # Treating each sentence as a document means splitting on the "# text"
    # sentence-level metadata line instead of the "# newdoc id" marker.
    split_doc_by = "# text" if separate_sentences_by_doc else _EWT_DOC_SEPERATOR
    parsed_docs = _parse_conll_u_file(
        input_file,
        column_names,
        iob_columns,
        has_predicate_args,
        merge_subtokens=merge_subtokens,
        merge_subtoken_separator=merge_subtoken_separator,
        metadata_fields=metadata_fields,
        doc_seperator=split_doc_by
    )
    doc_dfs = [
        _doc_to_df(d, column_names, iob_columns, space_before_punct, conll_u=True)
        for d in parsed_docs
    ]
    ret = [_iob_to_iob2(d, column_names, iob_columns) for d in doc_dfs]
    # Coerce the designated numeric columns; unparseable values become NaN.
    for d in ret:
        for col in numeric_cols:
            if col in d:
                d[col] = pd.to_numeric(d[col], errors="coerce")
    return ret
def conll_2003_output_to_dataframes(
    doc_dfs: List[pd.DataFrame],
    input_file: str,
    column_name: str = "ent",
    copy_tokens: bool = False,
) -> List[pd.DataFrame]:
    """
    Parse a file in CoNLL-2003 output format into one DataFrame per document.

    CoNLL-2003 output format looks like this::

        O
        O
        I-LOC
        O
        O

        I-PER
        I-PER

    Note the lack of any information about the tokens themselves. Note
    also the lack of any information about document boundaries.

    :param doc_dfs: List of :class:`pd.DataFrame`s of token information, as
        returned by :func:`conll_2003_to_dataframes`. This is needed because
        CoNLL-2003 output format does not include any information about
        document boundaries.
    :param input_file: Location of input file to read.
    :param column_name: Name for the metadata value that the IOB-tagged data
        in ``input_file`` encodes. If this name is present in ``doc_dfs``, its value
        will be replaced with the data from ``input_file``; otherwise a new column
        will be added to each dataframe.
    :param copy_tokens: If ``True``, deep-copy token series from the
        elements of `doc_dfs` instead of using pointers.
    :returns: A list containing, for each document in the input file,
        a separate :class:`pd.DataFrame` of four columns:

        * **span**: Span of each token, with character offsets.
          Backed by the concatenation of the tokens in the document into
          a single string with one sentence per line.
        * **token_span**: Span of each token, with token offsets.
          Backed by the contents of the `span` column.
        * **<column_name>_iob**: IOB2-format tags of tokens, corrected so that
          every entity begins with a "B" tag.
        * **<column_name>_type**: Entity type names for tokens tagged "I" or "B" in
          the ``<column_name>_iob`` column; ``None`` everywhere else.
    """
    outputs_by_doc = _parse_conll_output_file(doc_dfs, input_file)
    # Pair each document's token dataframe with its parsed tag sequence,
    # build a merged dataframe, then normalize the tags to IOB2.
    results = []
    for tokens, outputs in zip(doc_dfs, outputs_by_doc):
        doc_df = _output_doc_to_df(tokens, outputs, column_name, copy_tokens)
        results.append(_iob_to_iob2(doc_df, [column_name], [True]))
    return results
def make_iob_tag_categories(
    entity_types: List[str],
) -> Tuple[pd.CategoricalDtype, List[str], Dict[str, int]]:
    """
    Enumerate all the possible token categories for combinations of
    IOB tags and entity types (for example, ``I + "PER" ==> "I-PER"``).
    Generate a consistent mapping from these strings to integers.

    :param entity_types: Allowable entity type strings for the corpus
    :returns: A triple of:

        * Pandas CategoricalDtype
        * mapping from integer to string label, as a list. This mapping is
          guaranteed to be consistent with the mapping in the Pandas
          CategoricalDtype in the first return value.
        * mapping string label to integer, as a dict; the inverse of the second
          return value.
    """
    # "O" is always category 0; then all "B-*" labels, then all "I-*" labels,
    # preserving the order of entity_types within each group.
    int_to_label = ["O"]
    for iob_tag in ("B", "I"):
        int_to_label.extend(f"{iob_tag}-{ent}" for ent in entity_types)
    label_to_int = {label: index for index, label in enumerate(int_to_label)}
    return pd.CategoricalDtype(categories=int_to_label), int_to_label, label_to_int
def add_token_classes(
    token_features: pd.DataFrame,
    token_class_dtype: pd.CategoricalDtype = None,
    iob_col_name: str = "ent_iob",
    entity_type_col_name: str = "ent_type",
) -> pd.DataFrame:
    """
    Add additional columns to a dataframe of IOB-tagged tokens containing composite
    string and integer category labels for the tokens.

    :param token_features: Dataframe of tokens with IOB tags and entity type strings
    :param token_class_dtype: Optional Pandas categorical dtype indicating how to map
        composite tags like `I-PER` to integer values.
        You can use :func:`make_iob_tag_categories` to generate this dtype.
        If this parameter is not provided, this function will use an arbitrary
        mapping using the values that appear in this dataframe.
    :param iob_col_name: Optional name of a column in `token_features` that contains
        the IOB2 tags as strings, "I", "O", or "B".
    :param entity_type_col_name: Optional name of a column in `token_features`
        that contains entity type information; or `None` if no such column exists.
    :returns: A copy of `token_features` with two additional columns, `token_class`
        (string class label) and `token_class_id` (integer label).
        If `token_features` contains columns with either of these names, those
        columns will be overwritten in the returned copy of `token_features`.
    """
    if token_class_dtype is None:
        # Derive the set of entity types from the rows that actually carry one
        # (i.e. skip NaN and empty-string entity types).
        empty_mask = token_features[entity_type_col_name].isna() | (
            token_features[entity_type_col_name] == ""
        )
        # BUG FIX: the derived dtype was previously assigned to an unused local
        # ("token_class_type") and never passed to pd.Categorical below, so the
        # categorical's category order was inferred independently of label_to_int.
        # Bind it to token_class_dtype so both outputs use the same mapping.
        token_class_dtype, _, label_to_int = make_iob_tag_categories(
            list(token_features[~empty_mask][entity_type_col_name].unique())
        )
    else:
        label_to_int = {
            label: i for i, label in enumerate(token_class_dtype.categories)
        }
    # Build composite labels such as "B-PER"; plain "O" tokens stay "O".
    # zip over the two columns avoids the per-row overhead of iterrows().
    elems = [
        "O" if iob == "O" else f"{iob}-{ent}"
        for iob, ent in zip(
            token_features[iob_col_name], token_features[entity_type_col_name]
        )
    ]  # Type: List[str]
    ret = token_features.copy()
    ret["token_class"] = pd.Categorical(elems, dtype=token_class_dtype)
    ret["token_class_id"] = [label_to_int[elem] for elem in elems]
    return ret
def decode_class_labels(class_labels: Iterable[str]):
    """
    Decode the composite labels that :func:`add_token_classes` creates.

    :param class_labels: Iterable of string class labels like "I-LOC"
    :returns: A tuple of (IOB2 tags, entity type strings) corresponding
        to the class labels.
    """
    iobs = []
    types = []
    for label in class_labels:
        if label == "O":
            # Outside tokens carry no entity type.
            iobs.append("O")
            types.append(None)
        else:
            # Composite label like "B-PER": first char is the IOB tag,
            # the part after the dash is the entity type.
            iobs.append(label[:1])
            types.append(label.split("-")[1])
    return iobs, types
def maybe_download_conll_data(target_dir: str) -> Dict[str, str]:
    """
    Download and cache a copy of the CoNLL-2003 named entity recognition
    data set.

    **NOTE: This data set is licensed for research use only.**
    Be sure to adhere to the terms of the license when using this data set!

    :param target_dir: Directory where this function should write the corpus
        files, if they are not already present.
    :returns: Dictionary containing a mapping from fold name to file name for
        each of the three folds (`train`, `test`, `dev`) of the corpus.
    """
    _CONLL_DOWNLOAD_BASE_URL = (
        "https://github.com/patverga/torch-ner-nlp-from-scratch/raw/master/"
        "data/conll2003/"
    )
    _TRAIN_FILE_NAME = "eng.train"
    _DEV_FILE_NAME = "eng.testa"
    _TEST_FILE_NAME = "eng.testb"
    _TRAIN_FILE = f"{target_dir}/{_TRAIN_FILE_NAME}"
    _DEV_FILE = f"{target_dir}/{_DEV_FILE_NAME}"
    _TEST_FILE = f"{target_dir}/{_TEST_FILE_NAME}"

    # Make sure the cache directory exists before writing into it.
    os.makedirs(target_dir, exist_ok=True)

    def download_file(url, destination):
        data = requests.get(url)
        # Fail loudly on HTTP errors instead of caching an error page
        # as if it were corpus data.
        data.raise_for_status()
        # Context manager guarantees the file handle is closed.
        with open(destination, "wb") as f:
            f.write(data.content)

    # Only download each fold if it is not already cached locally.
    for file_name, file_path in (
        (_TRAIN_FILE_NAME, _TRAIN_FILE),
        (_DEV_FILE_NAME, _DEV_FILE),
        (_TEST_FILE_NAME, _TEST_FILE),
    ):
        if not os.path.exists(file_path):
            download_file(_CONLL_DOWNLOAD_BASE_URL + file_name, file_path)
    return {"train": _TRAIN_FILE, "dev": _DEV_FILE, "test": _TEST_FILE}
def maybe_download_dataset_data(
    target_dir: str, document_url: str, fname: str = None
) -> Union[str, List[str]]:
    """
    If the file found at the url is not found in the target directory,
    downloads it, and saves it to that place in downloads.
    Returns the path to the file. If a zip archive is downloaded, only files that
    are not already in the target directory will be fetched, and if an alternate
    name is given only that file will be operated on.
    Note if a zip archive is downloaded it will be unpacked so verify that the url
    being used is safe.

    :param target_dir: Directory where this function should write the document
    :param document_url: url from which to download the document. If no alternate
        name is specified, it is assumed that the string after the last slash is
        the name of the file.
    :param fname: if given, the name of the file that is checked in the target
        directory, as well as what is used to save the file if no such file is
        found. If a zip file is downloaded, and a file of this name exists in the
        archive, only it will be extracted.
    :returns: the path to the file (or a list of paths, when a multi-file zip
        archive is extracted without a specific ``fname``)
    """
    file_name = fname if fname is not None else document_url.split("/")[-1]
    full_path = os.path.join(target_dir, file_name)
    # If no directory exists, create one (including any missing parents).
    os.makedirs(target_dir, exist_ok=True)

    # Special logic for zip files
    if document_url.split(".")[-1] == "zip" and (
        fname is None or not os.path.exists(full_path)
    ):
        # If we have a zip file already, don't re-download it.
        zip_path = os.path.join(target_dir, document_url.split("/")[-1])
        if not os.path.exists(zip_path):
            data = requests.get(document_url)
            with open(zip_path, "wb") as f:
                f.write(data.content)
        # If need be, extract the zipfile documents.
        with ZipFile(zip_path, "r") as zipf:
            members = zipf.namelist()
            if fname is not None and fname in members:
                zipf.extract(fname, target_dir)
                return full_path
            for member in members:
                # BUG FIX: the original concatenated target_dir + member with no
                # path separator, so this existence check was effectively always
                # False and members were re-extracted every call.
                if not os.path.exists(os.path.join(target_dir, member)):
                    zipf.extract(member, target_dir)
            if len(members) == 1:
                full_path = os.path.join(target_dir, members[0])
            else:
                return [os.path.join(target_dir, member) for member in members]
    # Regular (non-zip) logic
    elif not os.path.exists(full_path):
        data = requests.get(document_url)
        with open(full_path, "wb") as f:
            f.write(data.content)
    return full_path
def _prep_for_stacking(fold_name: str, doc_num: int, df: pd.DataFrame) -> pd.DataFrame:
"""
Subroutine of combine_folds()
"""
df_values = {
"fold": fold_name,
"doc_num": doc_num,
}
for colname in df.columns:
df_values[colname] = df[colname]
return pd.DataFrame(df_values)
def combine_folds(fold_to_docs: Dict[str, List[pd.DataFrame]]):
    """
    Merge together multiple parts of a corpus (i.e. train, test, validation)
    into a single DataFrame of all tokens in the corpus.

    :param fold_to_docs: Mapping from fold name ("train", "test", etc.) to
        list of per-document DataFrames as produced by :func:`util.conll_to_bert`.
        All DataFrames must have the same schema, but any schema is ok.
    :returns: corpus wide DataFrame with some additional leading columns `fold`
        and `doc_num` to tell what fold and document number within the fold each
        row of the dataframe comes from.
    """
    # Tag every document with its fold name and position within the fold,
    # then stack all of them into one big dataframe with a fresh index.
    stacked = [
        _prep_for_stacking(fold_name, doc_num, doc)
        for fold_name, docs_in_fold in fold_to_docs.items()
        for doc_num, doc in enumerate(docs_in_fold)
    ]  # Type: List[pd.DataFrame]
    return pd.concat(stacked).reset_index(drop=True)
def compute_accuracy_by_document(
    corpus_dfs: Dict[Tuple[str, int], pd.DataFrame],
    output_dfs: Dict[Tuple[str, int], pd.DataFrame],
) -> pd.DataFrame:
    """
    Compute precision, recall, and F1 scores by document.

    :param corpus_dfs: Gold-standard span/entity type pairs, as either:

        * a dictionary of DataFrames, one DataFrame per document, indexed by
          tuples of (collection name, offset into collection)
        * a list of DataFrames, one per document

        as returned by :func:`conll_2003_output_to_dataframes()`
    :param output_dfs: Model outputs, in the same format as ``corpus_dfs``
        (i.e. exactly the same column names).
    :returns: DataFrame of per-document statistics: true positive / extracted /
        gold entity counts plus precision, recall, and F1.
    """
    if isinstance(corpus_dfs, list):
        if not isinstance(output_dfs, list):
            raise TypeError(
                f"corpus_dfs is a list, but output_dfs is of type "
                f"'{type(output_dfs)}', which is not a list."
            )
        # Normalize the list form to the dictionary form with an empty
        # collection name.
        corpus_dfs = {("", i): df for i, df in enumerate(corpus_dfs)}
        output_dfs = {("", i): df for i, df in enumerate(output_dfs)}

    # Iterate over corpus_dfs' keys exactly once so every per-document list
    # below is in the same order.
    keys = list(corpus_dfs.keys())
    stats_by_doc = pd.DataFrame(
        {
            "fold": [k[0] for k in keys],
            "doc_num": [k[1] for k in keys],
            # A true positive is a row present in both gold and output
            # (inner merge on all shared columns).
            "num_true_positives": [
                len(corpus_dfs[k].merge(output_dfs[k]).index) for k in keys
            ],
            "num_extracted": [len(output_dfs[k].index) for k in keys],
            "num_entities": [len(corpus_dfs[k].index) for k in keys],
        }
    )
    tp = stats_by_doc["num_true_positives"]
    stats_by_doc["precision"] = tp / stats_by_doc["num_extracted"]
    stats_by_doc["recall"] = tp / stats_by_doc["num_entities"]
    stats_by_doc["F1"] = (
        2.0
        * (stats_by_doc["precision"] * stats_by_doc["recall"])
        / (stats_by_doc["precision"] + stats_by_doc["recall"])
    )
    return stats_by_doc
def compute_global_accuracy(stats_by_doc: pd.DataFrame):
    """
    Compute collection-wide precision, recall, and F1 score from the
    output of :func:`compute_accuracy_by_document`.

    :param stats_by_doc: DataFrame of per-document counts with columns
        "num_true_positives", "num_entities", and "num_extracted"
    :returns: A Python dictionary of collection-level statistics about
        result quality.
    """
    # Sum the raw counts over all documents, then compute micro-averaged
    # precision/recall/F1 from the totals.
    totals = {
        name: stats_by_doc[name].sum()
        for name in ("num_true_positives", "num_entities", "num_extracted")
    }
    precision = totals["num_true_positives"] / totals["num_extracted"]
    recall = totals["num_true_positives"] / totals["num_entities"]
    f1 = 2.0 * (precision * recall) / (precision + recall)
    totals.update({"precision": precision, "recall": recall, "F1": f1})
    return totals