Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Decode html entities #71

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Added string extension method to decode html entities
  • Loading branch information
martinmaly21 committed Mar 4, 2023
commit 53bd43a18518e45bb1bd3cd90d4e4a03f9f7f13f
81 changes: 81 additions & 0 deletions Sources/OpenGraph/Extension/String.swift
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,85 @@ extension String {

self.init(data: data, encoding: encoding)
}

//Retreived from: https://stackoverflow.com/a/30141700
/// Returns a new string made by replacing in the `String`
/// all HTML character entity references with the corresponding
/// character.
var stringByDecodingHTMLEntities : String {
// Mapping from XML/HTML character entity reference to character
// From http:https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
let characterEntities : [ Substring : Character ] = [
// XML predefined entities:
""" : "\"",
"&" : "&",
"'" : "'",
"&lt;" : "<",
"&gt;" : ">",

// HTML character entity references:
"&nbsp;" : "\u{00a0}",
// ...
"&diams;" : "♦",
]

// ===== Utility functions =====

// Convert the number in the string to the corresponding
// Unicode character, e.g.
// decodeNumeric("64", 10) --> "@"
// decodeNumeric("20ac", 16) --> "€"
func decodeNumeric(_ string : Substring, base : Int) -> Character? {
guard let code = UInt32(string, radix: base),
let uniScalar = UnicodeScalar(code) else { return nil }
return Character(uniScalar)
}

// Decode the HTML character entity to the corresponding
// Unicode character, return `nil` for invalid input.
// decode("&#64;") --> "@"
// decode("&#x20ac;") --> "€"
// decode("&lt;") --> "<"
// decode("&foo;") --> nil
func decode(_ entity : Substring) -> Character? {

if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
return decodeNumeric(entity.dropFirst(3).dropLast(), base: 16)
} else if entity.hasPrefix("&#") {
return decodeNumeric(entity.dropFirst(2).dropLast(), base: 10)
} else {
return characterEntities[entity]
}
}

// ===== Method starts here =====

var result = ""
var position = startIndex

// Find the next '&' and copy the characters preceding it to `result`:
while let ampRange = self[position...].range(of: "&") {
result.append(contentsOf: self[position ..< ampRange.lowerBound])
position = ampRange.lowerBound

// Find the next ';' and copy everything from '&' to ';' into `entity`
guard let semiRange = self[position...].range(of: ";") else {
// No matching ';'.
break
}
let entity = self[position ..< semiRange.upperBound]
position = semiRange.upperBound

if let decoded = decode(entity) {
// Replace by decoded character:
result.append(decoded)
} else {
// Invalid entity, copy verbatim:
result.append(contentsOf: entity)
}
}
// Copy remaining characters to `result`:
result.append(contentsOf: self[position...])
return result
}
}