Skip to content

Commit

Permalink
Added static HTML generation to jupyter SpanArray renderer
Browse files Browse the repository at this point in the history
  • Loading branch information
PokkeFe committed Jun 30, 2021
1 parent 35e7165 commit 106e964
Showing 1 changed file with 197 additions and 4 deletions.
201 changes: 197 additions & 4 deletions text_extensions_for_pandas/jupyter.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,16 +150,14 @@ def pretty_print_html(column: Union["SpanArray", "TokenSpanArray"],
show_offset_string = 'true' if show_offsets else 'false'

return textwrap.dedent(f"""
<style class="span-array-css">
{textwrap.indent(style_text, ' ')}
</style>
<script>
{{
{textwrap.indent(script_text, ' ')}
}}
</script>
<div class="span-array">
If you're reading this message, your notebook viewer does not support Javascript execution. Try pasting the URL into a service like nbviewer.
{_get_initial_static_html(column, show_offsets)}
<span style="font-size: 0.8em;color: #b3b3b3;">If you're reading this message, your notebook viewer does not support Javascript execution. Try pasting the URL into a service like nbviewer.</span>
</div>
<script>
{{
Expand All @@ -173,3 +171,198 @@ def pretty_print_html(column: Union["SpanArray", "TokenSpanArray"],
</script>
{''.join(postfix_tags)}
""")

def _get_initial_static_html(column: Union["SpanArray", "TokenSpanArray"],
                             show_offsets: bool) -> str:
    """
    Subroutine of pretty_print_html.

    Builds the initial static HTML representation of the column, shown by
    notebook viewers that do not execute JavaScript.

    For each document (up to ``_DOCUMENT_DISPLAY_LIMIT`` of them) this:
      1. renders one table row per span,
      2. computes which later spans each span contains or overlaps,
      3. groups connected spans into "mark regions", and
      4. renders the document text with those regions wrapped in <mark> tags.

    :param column: span array to render; it is split with
     ``column.split_by_document()``.
    :param show_offsets: accepted for signature parity with
     pretty_print_html; this function does not currently read it.
    :return: concatenated HTML for all rendered documents.
    """
    documents = column.split_by_document()
    documents_html = []

    for column_index in range(min(_DOCUMENT_DISPLAY_LIMIT, len(documents))):
        document = documents[column_index]
        # Read the text once; it is sliced many times below.
        text = document.document_text

        # Single pass over the spans: collect offsets and render table rows.
        bounds = []
        table_rows_html = []
        for span in document:
            bounds.append((span.begin, span.end))
            cell_text = _get_sanitized_text(text[span.begin:span.end])
            table_rows_html.append(f"""
                <tr>
                    <td></td>
                    <td></td>
                    <td>{span.begin}</td>
                    <td>{span.end}</td>
                    <td>{cell_text}</td>
                </tr>
            """)

        # Build span records and their relationships to later spans.
        # NOTE(review): the early ``break`` assumes spans are ordered by
        # begin offset within a document -- confirm against the caller.
        spans = {}
        for i, (begin, end) in enumerate(bounds):
            related = []
            for j in range(i + 1, len(bounds)):
                other_begin, other_end = bounds[j]
                # If the spans do not overlap, stop looking further.
                if other_begin >= end:
                    break
                kind = "nested" if other_end <= end else "overlap"
                related.append({"type": kind, "id": j})
            spans[i] = {"id": i, "begin": begin, "end": end, "sets": related}

        # Group connected spans into mark regions.
        mark_regions = []
        i = 0
        while i < len(bounds):
            set_span = _get_set_span(spans, i)
            if not spans[i]["sets"]:
                region_type = "solo"
            elif _is_complex(spans, i):
                region_type = "complex"
            else:
                region_type = "nested"
            mark_regions.append({
                "root_id": i,
                "begin": spans[i]["begin"],
                "end": set_span["end"],
                "type": region_type,
            })
            # Skip every span already covered by this region.
            i = set_span["highest_id"] + 1

        # Render the document text with the mark regions highlighted.
        context_html = []
        snippet_begin = 0
        for region in mark_regions:
            region_begin = region["begin"]
            region_end = region["end"]

            # Unmarked text between the previous region and this one.
            leading_text = _get_sanitized_text(text[snippet_begin:region_begin])
            context_html.append(f"""
                {leading_text}
            """)

            if region["type"] == "complex":
                region_text = _get_sanitized_text(text[region_begin:region_end])
                context_html.append(f"""
                    <mark class='complex-set'>{region_text}<span class='mark-tag'>Set</span></mark>
                """)

            elif region["type"] == "nested":
                mark_html = []
                nested_snippet_begin = region_begin
                # Each span nested directly within the region's root span.
                for link in spans[region["root_id"]]["sets"]:
                    nested_span = spans[link["id"]]
                    gap_text = _get_sanitized_text(
                        text[nested_snippet_begin:nested_span["begin"]])
                    nested_text = _get_sanitized_text(
                        text[nested_span["begin"]:nested_span["end"]])
                    mark_html.append(f"""
                        {gap_text}
                        <mark>{nested_text}</mark>
                    """)
                    nested_snippet_begin = nested_span["end"]
                # Root-span text after the last nested span; previously
                # this tail was dropped from the output.
                mark_html.append(
                    _get_sanitized_text(text[nested_snippet_begin:region_end]))
                nested_html = "".join(mark_html)
                context_html.append(f"""
                    <mark>{nested_html}</mark>
                """)

            elif region["type"] == "solo":
                region_text = _get_sanitized_text(text[region_begin:region_end])
                context_html.append(f"""
                    <mark>{region_text}</mark>
                """)

            snippet_begin = region_end

        # Text after the last region (the whole document when there are no
        # regions); previously this tail was dropped whenever the document
        # had at least one span.
        context_html.append(_get_sanitized_text(text[snippet_begin:]))

        # Generate the document's HTML template.
        rows_html = "".join(table_rows_html)
        paragraph_html = "".join(context_html)
        documents_html.append(f"""
            <div class='document'>
                <table>
                    <thead><tr>
                        <th></th>
                        <th></th>
                        <th>begin</th>
                        <th>end</th>
                        <th>context</th>
                    </tr></thead>
                    <tbody>
                        {rows_html}
                    </tbody>
                </table>
                <p>
                    {paragraph_html}
                </p>
            </div>
        """)

    # Concat and return the final HTML string.
    return "".join(documents_html)

def _get_set_span(spans: Dict, id: int) -> Dict:
# Subroutine of _get_initial_static_html
# Recursive algorithm to get the last end and ID values of the set of spans connected to span with the given ID
# Will raise a KeyError exception if an invalid key is given

end = spans[id]["end"]
highest_id = id

# For each span in the set of spans, get the return values and take the largest end and highest ID
for set in spans[id]["sets"]:
other = _get_set_span(spans, set["id"])
if other["end"] > end:
end = other["end"]
if other["highest_id"] > highest_id:
highest_id = other["highest_id"]

return {"end": end, "highest_id": highest_id}

def _is_complex(spans: Dict, id: int) -> bool:
# Subroutine of _get_initial_static_html
# If any connection sets are of type:overlap or nested beyond a depth of 1, return True
# Will raise a KeyError exception if an invalid key is given

for set in spans[id]["sets"]:
if set["type"] == "overlap":
return True
elif set["type"] == "nested":
if len(spans[set["id"]]["sets"]) > 0:
return True
return False

def _get_sanitized_text(text: str) -> str:
# Subroutine of _get_initial_static_html
# Returns a string with HTML reserved character replacements to avoid issues while rendering text as HTML

text_pieces = []
for i in range(len(text)):
if text[i] == "&":
text_pieces.append("&amp;")
elif text[i] == "<":
text_pieces.append("&lt;")
elif text[i] == ">":
text_pieces.append("&gt;")
elif text[i] == "\"":
# Not strictly necessary, but just in case.
text_pieces.append("&quot;")
elif text[i] == "'":
# Not strictly necessary, but just in case.
text_pieces.append("&#39;")
elif text[i] == "$":
# Dollar sign messes up Jupyter's JavaScript UI.
# Place dollar sign in its own sub-span to avoid being misinterpeted as a LaTeX delimiter
text_pieces.append("<span>&#36;</span>")
else:
text_pieces.append(text[i])
return "".join(text_pieces)

0 comments on commit 106e964

Please sign in to comment.