-
Notifications
You must be signed in to change notification settings - Fork 1
/
crawl.test.js
74 lines (66 loc) · 2.8 KB
/
crawl.test.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
const { test, expect } = require('@jest/globals');
const { normalizeURL, getURLsFromHTML } = require('./crawl.js');
// The scheme ("https://") must be dropped, leaving only "host/path".
test('normalizeURL should only return a hostname/pathname string', () => {
  const rawURL = "https://blog.tanjaschmidt.com/path";

  const normalized = normalizeURL(rawURL);

  expect(normalized).toEqual("blog.tanjaschmidt.com/path");
});
// A single trailing "/" on the path must not survive normalization.
test('normalizeURL should trim a slash at the end', () => {
  const urlWithTrailingSlash = "https://blog.tanjaschmidt.com/path/";

  const normalized = normalizeURL(urlWithTrailingSlash);

  expect(normalized).toEqual("blog.tanjaschmidt.com/path");
});
// Upper-case characters in both the host and the path are expected to come
// back lower-cased.
test('normalizeURL should turn uppercase letters to lowercase', () => {
  const mixedCaseURL = "https://blog.TANJASCHMIDT.com/PATH";

  const normalized = normalizeURL(mixedCaseURL);

  expect(normalized).toEqual("blog.tanjaschmidt.com/path");
});
// Plain "http://" input must normalize to the same "host/path" string that
// the "https://" cases expect.
test('normalizeURL should work with a regular HTTP URL too', () => {
  const plainHttpURL = "http://blog.tanjaschmidt.com/path";

  const normalized = normalizeURL(plainHttpURL);

  expect(normalized).toEqual("blog.tanjaschmidt.com/path");
});
// Anchors that already carry absolute hrefs should be returned unchanged,
// in the order they appear in the markup.
test('getURLsFromHTML should return an array of URLs from absolute URLs', () => {
  const baseURL = "https://blog.tanjaschmidt.com";
  // The fixture is kept flush-left so the template literal contains exactly
  // these characters and no extra leading whitespace.
  const html = `<html>
<body>
<a href="https://blog.tanjaschmidt.com/path">Blog</a>
<a href="https://blog.tanjaschmidt.com/about">About</a>
</body>
</html>`;

  const links = getURLsFromHTML(html, baseURL);

  expect(links).toEqual([
    "https://blog.tanjaschmidt.com/path",
    "https://blog.tanjaschmidt.com/about",
  ]);
});
// Root-relative hrefs ("/path") are expected to be returned as absolute URLs
// built on the supplied base URL, in the order they appear in the markup.
test('getURLsFromHTML should return an array of URLs from relative URLs', () => {
  const baseURL = "https://blog.tanjaschmidt.com";
  // The fixture is kept flush-left so the template literal contains exactly
  // these characters and no extra leading whitespace.
  const html = `<html>
<body>
<a href="/path">Blog</a>
<a href="/about">About</a>
</body>
</html>`;

  const links = getURLsFromHTML(html, baseURL);

  expect(links).toEqual([
    "https://blog.tanjaschmidt.com/path",
    "https://blog.tanjaschmidt.com/about",
  ]);
});
// Two well-formed absolute links plus one href that is neither absolute nor
// root-relative; only the two well-formed links are expected in the result.
// NOTE(review): how "invalid" is decided lives in crawl.js, which is not
// visible from this file.
test('getURLsFromHTML should not return invalid URLs', () => {
  const baseURL = "https://blog.tanjaschmidt.com";
  // The fixture is kept flush-left so the template literal contains exactly
  // these characters and no extra leading whitespace.
  const html = `<html>
<body>
<a href="https://blog.tanjaschmidt.com/path">Blog</a>
<a href="https://blog.tanjaschmidt.com/about">About</a>
<a href="hah, not a URL!">No Link</a>
</body>
</html>`;

  const links = getURLsFromHTML(html, baseURL);

  expect(links).toEqual([
    "https://blog.tanjaschmidt.com/path",
    "https://blog.tanjaschmidt.com/about",
  ]);
});