# Source code for text_extensions_for_pandas.jupyter.span

#
#  Copyright (c) 2021 IBM Corp.
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

#
# span.py
#
# Part of text_extensions_for_pandas
#
# Support for span-centric Jupyter rendering and utilities
#

import textwrap
from typing import *
from enum import Enum
import text_extensions_for_pandas.resources

# TODO: This try/except block is for Python 3.6 support, and should be
# reduced to just importing importlib.resources when 3.6 support is dropped.
try:
    import importlib.resources as pkg_resources
except ImportError:
    import importlib_resources as pkg_resources


# Limits the max number of displayed documents. Matches Pandas' default display.max_seq_items.
_DOCUMENT_DISPLAY_LIMIT = 100


class SetType(Enum):
    """
    Relationship between a span and a later span that begins before the
    earlier span ends.
    """

    # The later span ends at or before the end of the earlier span.
    NESTED = 1
    # The later span extends past the end of the earlier span.
    OVERLAP = 2

class RegionType(Enum):
    """
    Kind of highlighted region produced when rendering the static HTML
    view of a document.
    """

    # The root span has connected spans, all nested directly inside it and
    # none of them having connected spans of their own.
    NESTED = 1
    # The root span has at least one overlapping span, or a nested span that
    # itself has connected spans.
    COMPLEX = 2
    # The root span has no connected spans.
    SOLO = 3


def pretty_print_html(column: Union["SpanArray", "TokenSpanArray"],
                      show_offsets: bool) -> str:
    """
    HTML pretty-printing of a series of spans for Jupyter notebooks.

    Args:
        column: Span column (either character or token spans).
        show_offsets: True to generate a table of span offsets in addition
         to the marked-up text

    Returns:
        An HTML fragment made of the widget stylesheet, the widget script,
        a static (no-JavaScript) rendering of the spans, and a script that
        instantiates the interactive widget. At most
        ``_DOCUMENT_DISPLAY_LIMIT`` documents are rendered; when more are
        present a footer reports the truncation.

    Raises:
        TypeError: If ``column`` is neither a ``SpanArray`` nor a
         ``TokenSpanArray``.
    """
    # Local import to prevent circular dependencies
    from text_extensions_for_pandas.array.span import SpanArray
    from text_extensions_for_pandas.array.token_span import TokenSpanArray

    if not isinstance(column, (SpanArray, TokenSpanArray)):
        raise TypeError(f"Expected SpanArray or TokenSpanArray, but received "
                        f"{column} of type {type(column)}")

    # The stylesheet and the main script ship in the 'resources' sub-package.
    resource_package = text_extensions_for_pandas.resources
    style_text: str = pkg_resources.read_text(resource_package, "span_array.css")
    script_text: str = pkg_resources.read_text(resource_package, "span_array.js")

    # One JavaScript snippet per displayed document; each snippet pushes a
    # document object onto the `documents` array declared in the final script.
    document_columns = column.split_by_document()
    displayed_count = min(_DOCUMENT_DISPLAY_LIMIT, len(document_columns))
    per_document_scripts: List[str] = []
    for doc_index in range(displayed_count):
        doc_column = document_columns[doc_index]

        # JavaScript array literals for the character (and token) offsets.
        char_pairs = []
        token_pairs = []
        for element in doc_column:
            char_pairs.append(f"[{element.begin},{element.end}]")
            if hasattr(element, "tokens"):
                token_pairs.append(f"[{element.begin_token},{element.end_token}]")

        doc_script = f"""
            const doc_spans = [{','.join(char_pairs)}]
            const doc_text = '{_get_escaped_doctext(doc_column)}'
        """

        if token_pairs:
            # Token-based spans also carry begin/end token indices.
            doc_script += f"""
                const doc_token_spans = [{','.join(token_pairs)}]
                documents.push({{doc_text: doc_text, doc_spans: doc_spans, doc_token_spans: doc_token_spans}})
            """
        else:
            doc_script += """
                documents.push({doc_text: doc_text, doc_spans: doc_spans})
            """

        # Braces give every document its own block scope so the `const`
        # declarations above do not collide between documents.
        per_document_scripts.append(f"""
            {{
                {doc_script}
            }}
        """)

    # Markup appended after the widget; only used to report truncation.
    if len(document_columns) > _DOCUMENT_DISPLAY_LIMIT:
        trailing_html = f"""
            <footer>Documents truncated. Showing {_DOCUMENT_DISPLAY_LIMIT} of {len(document_columns)}</footer>
        """
    else:
        trailing_html = ""

    # The show_offsets flag, spelled as a JavaScript boolean literal.
    if show_offsets:
        show_offset_string = 'true'
    else:
        show_offset_string = 'false'

    indented_style = textwrap.indent(style_text, '        ')
    indented_script = textwrap.indent(script_text, '        ')
    static_html = _get_initial_static_html(column, show_offsets)
    init_scripts = ''.join(per_document_scripts)

    return textwrap.dedent(f"""
        <style class="span-array-css">
            {indented_style}
        </style>
        <script>
        {{
            {indented_script}
        }}
        </script>
        <div class="span-array">
            {static_html}
            <span style="font-size: 0.8em;color: #b3b3b3;">Your notebook viewer does not support Javascript execution. The above rendering will not be interactive.</span>
        </div>
        <script>
            {{
                const Span = window.SpanArray.Span
                const script_context = document.currentScript
                const documents = []
                {init_scripts}
                const instance = new window.SpanArray.SpanArray(documents, {show_offset_string}, script_context)
                instance.render()
            }}
        </script>
        {trailing_html}
    """)
def _get_escaped_doctext(column: Union["SpanArray", "TokenSpanArray"]) -> List[str]: # Subroutine of pretty_print_html() above. # Should only be called for single-document span arrays. if not column.is_single_document: raise ValueError("Array contains spans from multiple documents. Can only " "render one document at a time.") text = column.document_text text_pieces = [] for i in range(len(text)): if text[i] == "'": text_pieces.append("\\'") elif text[i] == "\n": text_pieces.append("\\n") else: text_pieces.append(text[i]) return "".join(text_pieces) def _get_initial_static_html(column: Union["SpanArray", "TokenSpanArray"], show_offsets: bool) -> str: # Subroutine of pretty_print_html above. # Gets the initial static html representation of the column for notebook viewers without JavaScript support. # Iterates over each document and constructs the DOM string with template literals. # ! Text inserted into the DOM as raw HTML should always be sanitized to prevent unintended DOM manipulation # and XSS attacks. documents = column.split_by_document() documents_html = [] for column_index in range(min(_DOCUMENT_DISPLAY_LIMIT, len(documents))): document = documents[column_index] # Generate a dictionary to store span information, including relationships with spans occupying the same region. 
spans = {} is_token_document = False sorted_span_ids = [] for i in range(len(document)): span_data = {} span_data["id"] = i span_data["begin"] = document[i].begin span_data["end"] = document[i].end if hasattr(document[i], "tokens"): is_token_document = True span_data["begin_token"] = document[i].begin_token span_data["end_token"] = document[i].end_token span_data["sets"] = [] spans[i] = span_data sorted_span_ids.append(i) # Sort IDs sorted_span_ids.sort(key=lambda id: (spans[id]["begin"], -spans[id]["end"])) for i in range(len(sorted_span_ids)): span_data = spans[sorted_span_ids[i]] for j in range(i+1, len(sorted_span_ids)): sub_span_data = spans[sorted_span_ids[j]] # If the spans do not overlap, exit the sub-loop if(sub_span_data["begin"] >= span_data["end"]): break else: if(sub_span_data["end"] <= span_data["end"]): span_data["sets"].append({"type": SetType.NESTED, "id": sub_span_data["id"]}) else: span_data["sets"].append({"type": SetType.OVERLAP, "id": sub_span_data["id"]}) spans[sorted_span_ids[i]] = span_data # Generate the table rows DOM string from span data. table_rows_html = [] for i in range(len(spans)): span = spans[i] table_rows_html.append(f""" <tr> <td><b>{span["id"]}</b></td> <td>{span["begin"]}</td> <td>{span["end"]}</td> """) if is_token_document: table_rows_html.append(f""" <td>{span["begin_token"]}</td> <td>{span["end_token"]}</td> """) table_rows_html.append(f""" <td>{_get_sanitized_text(document.document_text[span["begin"]:span["end"]])}</td> </tr> """) # Generate the regions of the document_text to highlight from span data. 
mark_regions = [] i = 0 while i < len(document): region = {} region["root_id"] = i region["begin"] = spans[i]["begin"] set_span = _get_set_span(spans, i) region["end"] = set_span["end"] if len(spans[i]["sets"]) > 0: # get set span and type if(_is_complex(spans, i)): region["type"] = RegionType.COMPLEX else: region["type"] = RegionType.NESTED else: region["type"] = RegionType.SOLO mark_regions.append(region) i = set_span["highest_id"] + 1 # Generate the document_text DOM string from the regions created above. context_html = [] if len(mark_regions) == 0: # There are no marked regions. Just append the sanitized text as a raw string. context_html.append(_get_sanitized_text(document.document_text)) else: # Iterate over each marked region and contruct the HTML for preceding text and marked text. # Then, append that HTML to the list of DOM strings for the document_text. snippet_begin = 0 for region in mark_regions: context_html.append(f""" {_get_sanitized_text(document.document_text[snippet_begin:region["begin"]])} """) if region["type"] == RegionType.COMPLEX: context_html.append(f""" <span class='mark btn-info complex-set' style=' padding:0.4em; border-radius:0.35em; background:linear-gradient(to right, #a0c4ff, #ffadad); color: black; '>{_get_sanitized_text(document.document_text[region["begin"]:region["end"]])} <span class='mark-tag' style=' font-weight: bolder; font-size: 0.8em; font-variant: small-caps; font-variant-caps: small-caps; font-variant-caps: all-small-caps; margin-left: 8px; text-transform: uppercase; color: black; '>Set</span> </span> """) elif region["type"] == RegionType.NESTED: mark_html = [] nested_snippet_begin = region["begin"] # Iterate over each span nested within the root span of the marked region for nested_span in map( \ lambda set: spans[set["id"]], spans[region["root_id"]]["sets"]): mark_html.append(f""" {_get_sanitized_text(document.document_text[nested_snippet_begin:nested_span["begin"]])} <span class='mark btn-warning' style=' 
padding:0.2em 0.4em; border-radius:0.35em; background-color: #ffadad; color: black; '>{_get_sanitized_text(document.document_text[nested_span["begin"]:nested_span["end"]])}</span> """) nested_snippet_begin = nested_span["end"] mark_html.append(_get_sanitized_text(document.document_text[nested_snippet_begin:region["end"]])) context_html.append(f""" <span class='mark btn-primary' style='padding:0.4em;border-radius:0.35em;background-color: #a0c4ff;color:black;'>{"".join(mark_html)}</span> """) elif region["type"] == RegionType.SOLO: context_html.append(f""" <span class='mark btn-primary' style='padding:0.4em;border-radius:0.35em;background-color: #a0c4ff;color:black;'>{_get_sanitized_text(document.document_text[region["begin"]:region["end"]])}</span> """) snippet_begin = region["end"] context_html.append(_get_sanitized_text(document.document_text[snippet_begin:])) # Generate the document's DOM string documents_html.append(f""" <div class='document'> <table style=' table-layout: auto; overflow: hidden; width: 100%; border-collapse: collapse; '> <thead style='font-variant-caps: all-petite-caps;'> <th></th> <th>begin</th> <th>end</th> {"<th>begin token</th><th>end token</th>" if is_token_document else ""} <th style='text-align:right;width:100%'>context</th> </tr></thead> <tbody> {"".join(table_rows_html)} </tbody> </table> <p style=' padding: 1em; line-height: calc(var(--jp-content-line-height, 1.6) * 1.6); '> {"".join(context_html)} </p> </div> """) # Concat all documents and return the final DOM string return "".join(documents_html) def _get_set_span(spans: Dict, id: int) -> Dict: # Subroutine of _get_initial_static_html() above. # Recursive algorithm to get the last end and ID values of the set of spans connected to span with the given ID # Will raise a KeyError exception if an invalid key is given end = spans[id]["end"] highest_id = id # For each span in the set of spans, get the return values and track the greatest endpoint index and ID values. 
for set in spans[id]["sets"]: other = _get_set_span(spans, set["id"]) if other["end"] > end: end = other["end"] if other["highest_id"] > highest_id: highest_id = other["highest_id"] return {"end": end, "highest_id": highest_id} def _is_complex(spans: Dict, id: int) -> bool: # Subroutine of _get_initial_static_html() above. # Returns True if the provided span should be considered a "Complex" span. Implementation details below. # Will raise a KeyError exception if an invalid key is given # If any connection sets are of type:overlap or nested beyond a depth of 1, return True for set in spans[id]["sets"]: if set["type"] == SetType.OVERLAP: return True elif set["type"] == SetType.NESTED: if len(spans[set["id"]]["sets"]) > 0: return True return False def _get_sanitized_text(text: str) -> str: # Subroutine of _get_initial_static_html() above. # Returns a string with HTML reserved character replacements to avoid issues while rendering text as HTML text_pieces = [] for i in range(len(text)): if text[i] == "&": text_pieces.append("&amp;") elif text[i] == "<": text_pieces.append("&lt;") elif text[i] == ">": text_pieces.append("&gt;") elif text[i] == "\"": # Not strictly necessary, but just in case. text_pieces.append("&quot;") elif text[i] == "'": # Not strictly necessary, but just in case. text_pieces.append("&#39;") elif text[i] == "$": # Dollar sign messes up Jupyter's JavaScript UI. # Place dollar sign in its own sub-span to avoid being misinterpeted as a LaTeX delimiter text_pieces.append("<span>&#36;</span>") elif text[i] == "\n" or text[i] == "\r": # Support for in-document newlines by replacing with line break elements text_pieces.append("<br>") else: text_pieces.append(text[i]) return "".join(text_pieces)