-
Notifications
You must be signed in to change notification settings - Fork 25
Expand file tree
/
Copy pathmain.swift
More file actions
164 lines (134 loc) · 4.47 KB
/
main.swift
File metadata and controls
164 lines (134 loc) · 4.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
//
// main.swift
// HTMLKitExample
//
// Created by Iska on 27/09/16.
// Copyright © 2016 iabudiab. All rights reserved.
//
import HTMLKit
// Simple scraper that is able to load a page, query via CSS Selectors, and following links
class Scraper {
enum ScrapingError: Error {
case DocumentNotLoaded
case ElementNotFound(String)
case InvalidAnchorUrl(String)
case CouldNotLoadPage(URL)
}
private var url: URL
private(set) var document: HTMLDocument?
init(url: URL) {
self.url = url
self.document = nil
}
func load() throws {
try loadDocument(at: url)
}
func listElements(matching selector: CSSSelector) throws -> [HTMLElement] {
guard let document = document else {
throw ScrapingError.DocumentNotLoaded
}
return document.elements(matching: selector)
}
func followLink(matchingSelector selector: CSSSelector) throws {
guard let document = document else {
throw ScrapingError.DocumentNotLoaded
}
guard let link = document.firstElement(matching: selector) else {
throw ScrapingError.ElementNotFound(selector.debugDescription)
}
guard let targetUrl = URL(string: link["href"], relativeTo: url) else {
throw ScrapingError.InvalidAnchorUrl(link["href"])
}
try loadDocument(at: targetUrl)
}
private func loadDocument(at url: URL) throws {
guard let content = try? String(contentsOf: url) else {
throw ScrapingError.CouldNotLoadPage(url)
}
document = HTMLDocument(string: content)
}
}
// A custom block-based selector, that matches only elements having the given text content:
// i.e. textContentSelector("Hello") will match <p>Hello</p> and <a href='example.com'>Hello</a>
// but wont match <div>World</div> or <p>Hello there</p>
func textContentSelector(text: String) -> CSSSelector {
return namedBlockSelector("[@textContent='\(text)']") { (element) -> Bool in
return element.textContent == text
}
}
// Helper function to create a typed-selector matching an anchor element that has the given
// text content.
func anchorElement(havingContent: String) -> CSSSelector {
return allOf(
[
typeSelector("a"),
textContentSelector(text: havingContent)
]
)
}
// Helper function to print the content of a github repository file content
func printRepositoryFile(element: HTMLElement) {
// A node iterator filter that iterates only <td> elements of class "content" i.e. <td class='content'>
let contentIterator = element.nodeIterator(showOptions: .element) { (node) -> HTMLNodeFilterValue in
guard let element = node as? HTMLElement else { return .reject }
if element.tagName == "td" && element["class"] == "content" {
return .accept
}
return .reject
}
for td in contentIterator {
// The cast is necessary because Swift3 wont import the generics info of the NSEnumerator class
// i.e. the nextObject() function alwasy has the following signature 'func nextObject() -> Any?'
let title = (td as AnyObject).textContent.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
print("- \(title)")
}
}
let htmlKitUrl = URL(string: "https://github.com/iabudiab/HTMLKit")!
let scraper = Scraper(url: htmlKitUrl)
do {
// Load the page
try scraper.load()
// Parse the selector
let repositoryContent = try CSSSelectorParser.parseSelector(".repository-content > .file-wrap > table.files tr.js-navigation-item")
// Query matching elements
let files = try scraper.listElements(matching: repositoryContent)
print("HTMLKit repositroy root:")
files.forEach(printRepositoryFile)
} catch let error {
print(error)
}
do {
// Follow some links
try scraper.followLink(matchingSelector: anchorElement(havingContent: "Sources"))
try scraper.followLink(matchingSelector: anchorElement(havingContent: "HTMLEOFToken.m"))
// The following selector: "[role='main'] div.file table.js-file-line-container td:nth-child(2)"
// can be defined in type-safe manner:
let selector = allOf([
descendantOfElementSelector(
allOf([
typeSelector("div"),
classSelector("repository-content")
])
),
descendantOfElementSelector(
allOf([
typeSelector("table"),
classSelector("js-file-line-container")
])
),
typeSelector("td"),
nthChildSelector(
CSSNthExpressionMake(0, 2)
)
])
// Query matching elements
let elements = try scraper.listElements(matching: selector)
// This will print the source code for the "HTMLEOFToken.m" file under this url:
// https://github.com/iabudiab/HTMLKit/blob/master/Sources/HTMLEOFToken.m
print("\nHTMLEOFToken:")
elements.forEach {
print($0.textContent)
}
} catch let error {
print(error)
}