Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Decode html entities #71

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
81 changes: 81 additions & 0 deletions Sources/OpenGraph/Extension/String.swift
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,85 @@ extension String {

self.init(data: data, encoding: encoding)
}

// Retrieved from: https://stackoverflow.com/a/30141700
/// Returns a new string made by replacing all HTML character entity
/// references in the receiver with the corresponding characters.
///
/// Three forms are recognized:
/// - named references listed in the table below (e.g. `&lt;`),
/// - decimal numeric references (e.g. `&#64;`),
/// - hexadecimal numeric references (e.g. `&#x20ac;` or `&#X20AC;`).
///
/// Anything that is not a recognized reference (an unknown name, an
/// invalid code point, a `&` without a closing `;`) is copied verbatim.
var stringByDecodingHTMLEntities: String {
    // Mapping from XML/HTML character entity reference to character.
    // From https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
    let characterEntities: [Substring: Character] = [
        // XML predefined entities:
        "&quot;": "\"",
        "&amp;": "&",
        "&apos;": "'",
        "&lt;": "<",
        "&gt;": ">",

        // HTML character entity references:
        "&nbsp;": "\u{00a0}",
        // ...
        "&diams;": "♦",
    ]

    // ===== Utility functions =====

    // Convert the number in the string to the corresponding
    // Unicode character, e.g.
    //    decodeNumeric("64", base: 10)   --> "@"
    //    decodeNumeric("20ac", base: 16) --> "€"
    // Returns `nil` when the digits do not parse or do not name a valid
    // Unicode scalar.
    func decodeNumeric(_ digits: Substring, base: Int) -> Character? {
        // `UInt32(_:radix:)` tolerates a leading "+" or "-", which is not
        // part of a numeric character reference, so reject it up front.
        guard let first = digits.first, first != "+", first != "-",
              let code = UInt32(digits, radix: base),
              let scalar = UnicodeScalar(code) else { return nil }
        return Character(scalar)
    }

    // Decode the HTML character entity to the corresponding
    // Unicode character, return `nil` for invalid input.
    //     decode("&#64;")    --> "@"
    //     decode("&#x20ac;") --> "€"
    //     decode("&lt;")     --> "<"
    //     decode("&foo;")    --> nil
    func decode(_ entity: Substring) -> Character? {
        if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
            // Strip the leading "&#x" and the trailing ";".
            return decodeNumeric(entity.dropFirst(3).dropLast(), base: 16)
        } else if entity.hasPrefix("&#") {
            // Strip the leading "&#" and the trailing ";".
            return decodeNumeric(entity.dropFirst(2).dropLast(), base: 10)
        } else {
            return characterEntities[entity]
        }
    }

    // ===== Method starts here =====

    var result = ""
    var position = startIndex

    // Find the next '&' and copy the characters preceding it to `result`:
    while let ampIndex = self[position...].firstIndex(of: "&") {
        result.append(contentsOf: self[position ..< ampIndex])
        position = ampIndex

        // Look for whichever comes first after the '&': the closing ';'
        // or another '&'.
        let searchStart = index(after: ampIndex)
        guard let terminator = self[searchStart...].firstIndex(where: { $0 == ";" || $0 == "&" }) else {
            // Neither a ';' nor another '&' follows: nothing left to decode.
            break
        }

        if self[terminator] == "&" {
            // The current '&' is a stray ampersand (e.g. "AT&T &amp; co").
            // Copy it and the text up to the next '&' verbatim and restart
            // there, so a following valid entity is not swallowed.
            result.append(contentsOf: self[ampIndex ..< terminator])
            position = terminator
            continue
        }

        // Everything from '&' through ';' is the candidate entity.
        let entity = self[ampIndex ... terminator]
        position = index(after: terminator)

        if let decoded = decode(entity) {
            // Replace by decoded character:
            result.append(decoded)
        } else {
            // Invalid entity, copy verbatim:
            result.append(contentsOf: entity)
        }
    }
    // Copy remaining characters to `result`:
    result.append(contentsOf: self[position...])
    return result
}
}
2 changes: 1 addition & 1 deletion Sources/OpenGraph/OpenGraphParser.swift
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ extension OpenGraphParser {

let nsMetaTag = metaTag as NSString
let property = nsMetaTag.substring(with: propertyResult.range(at: 1))
let content = nsMetaTag.substring(with: contentResult.range(at: 1))
let content = nsMetaTag.substring(with: contentResult.range(at: 1)).stringByDecodingHTMLEntities

return (name: property, content: content)
}()
Expand Down