Added static HTML generation to jupyter SpanArray renderer

CODAIT · Jun 30, 2021 · 106e964 · 106e964
1 parent 35e7165
commit 106e964
Showing 1 changed file with 197 additions and 4 deletions.
diff --git a/text_extensions_for_pandas/jupyter.py b/text_extensions_for_pandas/jupyter.py
@@ -150,16 +150,14 @@ def pretty_print_html(column: Union["SpanArray", "TokenSpanArray"],
  show_offset_string = 'true' if show_offsets else 'false'
 
  return textwrap.dedent(f"""
- <style class="span-array-css">
- {textwrap.indent(style_text, ' ')}
- </style>
  <script>
  {{
  {textwrap.indent(script_text, ' ')}
  }}
  </script>
  <div class="span-array">
- If you're reading this message, your notebook viewer does not support Javascript execution. Try pasting the URL into a service like nbviewer.
+ {_get_initial_static_html(column, show_offsets)}
+ <span style="font-size: 0.8em;color: #b3b3b3;">If you're reading this message, your notebook viewer does not support Javascript execution. Try pasting the URL into a service like nbviewer.</span>
  </div>
  <script>
  {{
@@ -173,3 +171,198 @@ def pretty_print_html(column: Union["SpanArray", "TokenSpanArray"],
  </script>
  {''.join(postfix_tags)}
  """)
+
def _get_initial_static_html(column: Union["SpanArray", "TokenSpanArray"],
                             show_offsets: bool) -> str:
    """
    Subroutine of pretty_print_html.

    Builds the initial static HTML representation of the column, shown by
    notebook viewers that do not execute JavaScript.

    For each document (up to ``_DOCUMENT_DISPLAY_LIMIT`` of them) this renders
    a table of the spans followed by the document text with the spans
    highlighted.

    :param column: Span column to render; it is split into per-document
     arrays with ``split_by_document()``.
    :param show_offsets: Accepted for parity with ``pretty_print_html``; the
     static rendering does not read it.
    :returns: The concatenated HTML of all rendered documents.
    """
    documents = column.split_by_document()
    display_count = min(_DOCUMENT_DISPLAY_LIMIT, len(documents))
    return "".join(
        _render_static_document(documents[index])
        for index in range(display_count)
    )


def _render_static_document(document) -> str:
    # Render one document: the span table plus the highlighted context text.
    text = document.document_text
    # Read every span's offsets once instead of indexing the array repeatedly.
    bounds = [(span.begin, span.end) for span in document]

    table_rows_html = "".join(
        f"""
        <tr>
            <td></td>
            <td></td>
            <td>{begin}</td>
            <td>{end}</td>
            <td>{_get_sanitized_text(text[begin:end])}</td>
        </tr>
        """
        for begin, end in bounds
    )

    spans = _link_spans(bounds)
    mark_regions = _find_mark_regions(spans)
    context_html = _render_context(text, spans, mark_regions)

    return f"""
    <div class='document'>
        <table>
            <thead><tr>
                <th></th>
                <th></th>
                <th>begin</th>
                <th>end</th>
                <th>context</th>
            </tr></thead>
            <tbody>
                {table_rows_html}
            </tbody>
        </table>
        <p>
            {context_html}
        </p>
    </div>
    """


def _link_spans(bounds: list) -> Dict:
    # Build the span records keyed by position, each listing the later spans
    # that start before it ends ("nested" if they also end inside it,
    # "overlap" otherwise).
    # The early exit assumes the spans are ordered by begin offset.
    spans = {}
    for index, (begin, end) in enumerate(bounds):
        links = []
        for other_index in range(index + 1, len(bounds)):
            other_begin, other_end = bounds[other_index]
            if other_begin >= end:
                # First span that starts at or after our end: stop scanning.
                break
            link_type = "nested" if other_end <= end else "overlap"
            links.append({"type": link_type, "id": other_index})
        spans[index] = {"id": index, "begin": begin, "end": end, "sets": links}
    return spans


def _find_mark_regions(spans: Dict) -> list:
    # Group connected spans into regions to highlight, classifying each region
    # as "solo" (no linked spans), "nested" (only directly nested spans) or
    # "complex" (anything else).
    mark_regions = []
    index = 0
    while index < len(spans):
        set_span = _get_set_span(spans, index)
        if not spans[index]["sets"]:
            region_type = "solo"
        elif _is_complex(spans, index):
            region_type = "complex"
        else:
            region_type = "nested"
        mark_regions.append({
            "root_id": index,
            "begin": spans[index]["begin"],
            "end": set_span["end"],
            "type": region_type,
        })
        # Continue after the last span that belongs to this region.
        index = set_span["highest_id"] + 1
    return mark_regions


def _render_context(text: str, spans: Dict, mark_regions: list) -> str:
    # Render the document text with every mark region wrapped in <mark> tags.
    # Fragments are joined without extra whitespace so that no spaces are
    # introduced in the middle of the displayed text.
    if not mark_regions:
        return _get_sanitized_text(text)

    pieces = []
    cursor = 0
    for region in mark_regions:
        region_begin = region["begin"]
        region_end = region["end"]
        # Plain text between the previous region and this one.
        pieces.append(_get_sanitized_text(text[cursor:region_begin]))

        if region["type"] == "complex":
            pieces.append(
                f"<mark class='complex-set'>"
                f"{_get_sanitized_text(text[region_begin:region_end])}"
                f"<span class='mark-tag'>Set</span></mark>"
            )
        elif region["type"] == "nested":
            inner = []
            inner_cursor = region_begin
            # Each span nested directly within the root span of the region.
            for link in spans[region["root_id"]]["sets"]:
                nested_span = spans[link["id"]]
                nested_begin = nested_span["begin"]
                nested_end = nested_span["end"]
                inner.append(
                    _get_sanitized_text(text[inner_cursor:nested_begin]))
                inner.append(
                    f"<mark>"
                    f"{_get_sanitized_text(text[nested_begin:nested_end])}"
                    f"</mark>"
                )
                inner_cursor = nested_end
            # Remainder of the root span after the last nested span.
            inner.append(_get_sanitized_text(text[inner_cursor:region_end]))
            pieces.append(f"<mark>{''.join(inner)}</mark>")
        elif region["type"] == "solo":
            pieces.append(
                f"<mark>"
                f"{_get_sanitized_text(text[region_begin:region_end])}"
                f"</mark>"
            )

        cursor = region_end

    # Document text that follows the last region.
    pieces.append(_get_sanitized_text(text[cursor:]))
    return "".join(pieces)
+
def _get_set_span(spans: Dict, id: int) -> Dict:
    """
    Subroutine of _get_initial_static_html.

    Finds the largest end offset and the highest ID among the span with the
    given ID and every span reachable from it through the ``"sets"`` links.

    The links are walked iteratively and every span is visited once, so
    densely overlapping spans do not cause repeated work and long chains of
    overlapping spans cannot exceed the recursion limit.

    :param spans: Dictionary of span records keyed by span ID. Each record
     has an ``"end"`` value and a ``"sets"`` list of ``{"type", "id"}`` links.
    :param id: ID of the span to start from.
    :returns: Dictionary with the keys ``"end"`` and ``"highest_id"``.
    :raises KeyError: If ``id`` or any linked ID is not a key of ``spans``.
    """
    end = spans[id]["end"]
    highest_id = id

    visited = {id}
    pending = [id]
    while pending:
        current = spans[pending.pop()]
        if current["end"] > end:
            end = current["end"]
        for link in current["sets"]:
            linked_id = link["id"]
            if linked_id in visited:
                continue
            visited.add(linked_id)
            pending.append(linked_id)
            if linked_id > highest_id:
                highest_id = linked_id

    return {"end": end, "highest_id": highest_id}
+
def _is_complex(spans: Dict, id: int) -> bool:
    """
    Subroutine of _get_initial_static_html.

    Tells whether the set of spans linked to the given span is "complex":
    at least one link is of type ``"overlap"``, or a ``"nested"`` link points
    to a span that has links of its own (nesting deeper than one level).

    :param spans: Dictionary of span records keyed by span ID. Each record
     has a ``"sets"`` list of ``{"type", "id"}`` links.
    :param id: ID of the span whose links are inspected.
    :returns: True if the span's set is complex, False otherwise.
    :raises KeyError: If ``id`` or an inspected linked ID is not a key of
     ``spans``.
    """
    return any(
        link["type"] == "overlap"
        or (link["type"] == "nested" and len(spans[link["id"]]["sets"]) > 0)
        for link in spans[id]["sets"]
    )
+
# Replacement markup for the characters that must not reach the HTML verbatim.
_HTML_ESCAPE_TABLE = str.maketrans({
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    # Quotes are not strictly necessary, but escaped just in case.
    "\"": "&quot;",
    "'": "&#39;",
    # Dollar sign messes up Jupyter's JavaScript UI.
    # Place dollar sign in its own sub-span to avoid being misinterpreted
    # as a LaTeX delimiter.
    "$": "<span>&#36;</span>",
})


def _get_sanitized_text(text: str) -> str:
    """
    Subroutine of _get_initial_static_html.

    Replaces HTML reserved characters (and the dollar sign) in ``text`` so
    that the text can be embedded in HTML without being interpreted as markup.

    :param text: Raw text to embed.
    :returns: The text with ``&``, ``<``, ``>``, ``"``, ``'`` and ``$``
     replaced; all other characters are left unchanged.
    """
    return text.translate(_HTML_ESCAPE_TABLE)