Skip to content
This repository was archived by the owner on Aug 12, 2022. It is now read-only.

Commit 84197c1

Browse files
authored
Improve performance of the StringSearchService with HTML resources (#162)
`StringSearchService` was sometimes very slow on certain HTML resources. The bottleneck was the parsing done by SwiftSoup (a third-party dependency). To solve this, we now attempt to parse the resource as strict XML first and fall back on SwiftSoup only when the HTML resource is not valid XML. As EPUB resources are supposed to be valid XHTML, this improves the performance of EPUB search.
1 parent 25ce548 commit 84197c1

File tree

4 files changed

+39
-6
lines changed

4 files changed

+39
-6
lines changed

.github/workflows/build.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ jobs:
1313
env:
1414
scheme: ${{ 'r2-shared-swift' }}
1515
platform: ${{ 'iOS Simulator' }}
16+
device: ${{ 'iPhone 13' }}
1617

1718
steps:
1819
- name: Checkout
@@ -30,9 +31,7 @@ jobs:
3031
rm -rf r2-shared-swift.xcodeproj
3132
- name: Build
3233
run: |
33-
device=`xcrun xctrace list devices 2>&1 | grep -oE 'iPhone.*?[^\(]+' | head -1 | awk '{$1=$1;print}'`
3434
xcodebuild build-for-testing -scheme "$scheme" -destination "platform=$platform,name=$device"
3535
- name: Test
3636
run: |
37-
device=`xcrun xctrace list devices 2>&1 | grep -oE 'iPhone.*?[^\(]+' | head -1 | awk '{$1=$1;print}'`
3837
xcodebuild test-without-building -scheme "$scheme" -destination "platform=$platform,name=$device"

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@ All notable changes to this project will be documented in this file.
1010

1111
* Support for Paragraph Margins user setting.
1212

13+
### Fixed
14+
15+
* Improved the performance of the search service used with EPUB.
16+
17+
1318
## [2.1.0]
1419

1520
### Added

r2-shared-swift/Fetcher/Resource/ResourceContentExtractor.swift

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,19 +54,47 @@ public class _DefaultResourceContentExtractorFactory: _ResourceContentExtractorF
5454
/// notice. Use with caution.
5555
class _HTMLResourceContentExtractor: _ResourceContentExtractor {
5656

57+
private let xmlFactory = DefaultXMLDocumentFactory()
58+
5759
func extractText(of resource: Resource) -> ResourceResult<String> {
5860
resource.readAsString()
59-
.flatMap { html in
61+
.flatMap { content in
6062
do {
61-
var text = try SwiftSoup.parse(html).body()?.text() ?? ""
63+
// First try to parse a valid XML document, then fall back on SwiftSoup, which is slower.
64+
var text = parse(xml: content)
65+
?? parse(html: content)
66+
?? ""
67+
6268
// Transform HTML entities into their actual characters.
6369
text = try Entities.unescape(text)
70+
6471
return .success(text)
6572

6673
} catch {
6774
return .failure(.wrap(error))
6875
}
6976
}
7077
}
71-
72-
}
78+
79+
// Parse the HTML resource as a strict XML document.
80+
//
81+
// This is much more efficient than using SwiftSoup, but will fail when encountering
82+
// invalid HTML documents.
83+
private func parse(xml: String) -> String? {
84+
guard let document = try? xmlFactory.open(string: xml, namespaces: [
85+
XMLNamespace(prefix: "xhtml", uri: "http://www.w3.org/1999/xhtml")
86+
])
87+
else {
88+
return nil
89+
}
90+
91+
return document.first("/xhtml:html/xhtml:body")?.textContent
92+
}
93+
94+
// Parse the HTML resource with SwiftSoup.
95+
//
96+
// This may be slow but will recover from broken HTML documents.
97+
private func parse(html: String) -> String? {
98+
return try? SwiftSoup.parse(html).body()?.text()
99+
}
100+
}

r2-shared-swift/Toolkit/XML/Fuzi.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ final class FuziXMLDocument: XMLDocument, Loggable {
1818

1919
init(string: String, namespaces: [XMLNamespace]) throws {
2020
self.document = try Fuzi.XMLDocument(string: string)
21+
document.definePrefixes(namespaces)
2122
}
2223

2324
lazy var documentElement: XMLElement? =

0 commit comments

Comments
 (0)