forked from atauenis/webone
-
Notifications
You must be signed in to change notification settings - Fork 0
/
WebArchiveRequest.cs
86 lines (76 loc) · 2.71 KB
/
WebArchiveRequest.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
using System;
using System.Linq;
using System.Net.Http;
using System.IO;
namespace WebOne
{
/// <summary>
/// Request to an Internet Archive Wayback Machine CDX server for an archived copy of a Web page
/// </summary>
class WebArchiveRequest
{
	//Documentation:
	//https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md
	//https://github.com/atauenis/webone/wiki/Wayback-Machine
	//https://archive.org/help/wayback_api.php

	/// <summary>
	/// Number of space-separated fields requested per CDX line (timestamp, original, statuscode)
	/// </summary>
	private const int CdxFieldsCount = 3;

	/// <summary>
	/// HTTP client shared by all CDX lookups, so that each request does not allocate
	/// (and leak) its own connection handler
	/// </summary>
	private static readonly HttpClient CdxClient = new HttpClient();

	/// <summary>
	/// Check Wayback Machine for archived copy of Web page at <paramref name="URL"/>
	/// </summary>
	/// <param name="URL">Address of original Web page</param>
	/// <exception cref="HttpRequestException">The CDX server answered with a non-success status code</exception>
	/// <exception cref="InvalidDataException">The CDX server answer does not contain any well-formed line</exception>
	public WebArchiveRequest(string URL)
	{
		string CdxUrl = string.Format(
			"https://web.archive.org/cdx/search/cdx?fl={0}&url={1}",
			"timestamp,original,statuscode", //fields: ["urlkey","timestamp","original","mimetype","statuscode","digest","length"]
			Uri.EscapeDataString(URL));

		//send request to CDX server; request, response and reader are disposed as soon as the body is read
		string[] CdxBody;
		using (var CdxRequest = new HttpRequestMessage(HttpMethod.Get, new Uri(CdxUrl)))
		using (var CdxResponse = CdxClient.Send(CdxRequest))
		{
			if (!CdxResponse.IsSuccessStatusCode)
			{
				//parentheses are required: "+" binds tighter than "??"
				string Reason = string.IsNullOrEmpty(CdxResponse.ReasonPhrase) ? "without reason" : CdxResponse.ReasonPhrase;
				throw new HttpRequestException("Unsuccessful Web Archive request: " + Reason);
			}

			using (var CdxReader = new StreamReader(CdxResponse.Content.ReadAsStream()))
			{
				CdxBody = CdxReader.ReadToEnd().TrimEnd().Split('\n');
			}
		}

		//Split() always returns at least one element, so an empty first line means "no snapshots"
		if (CdxBody.Length == 0 || CdxBody[0] == string.Empty)
		{
			//not archived
			Archived = false;
			ArchivedURL = "";
			return;
		}

		//CDX timestamps look like yyyyMMddHHmmss, the limit is multiplied by 10^6 to be comparable with them
		var DateLimit = ConfigFile.ArchiveDateLimit;
		bool HasDateLimit = DateLimit > 0;
		var TimestampLimit = DateLimit * 1_000_000L;

		//find last (or last at ArchiveDateLimit date) archived version, preferable without redirects
		string LastOkEntry = null;  //last suitable snapshot with status code 200
		string LastAnyEntry = null; //last suitable snapshot with any status code
		foreach (string CdxEntry in CdxBody)
		{
			string[] Fields = CdxEntry.Split(' ');
			if (Fields.Length != CdxFieldsCount) continue;

			if (HasDateLimit)
			{
				//a line with unreadable timestamp cannot be proven to be inside the limit
				if (!long.TryParse(Fields[0], out long Timestamp)) continue;
				if (Timestamp > TimestampLimit) continue;
			}

			LastAnyEntry = CdxEntry;
			if (Fields[2] == "200") LastOkEntry = CdxEntry;
		}

		//order of preference: status 200 inside the limit, any status inside the limit,
		//and only as a last resort the very last line of the answer (original behaviour)
		string LastCdxEntry = LastOkEntry ?? LastAnyEntry ?? CdxBody[^1];

		string[] ResultFields = LastCdxEntry.Split(' ');
		if (ResultFields.Length != CdxFieldsCount)
		{
			//bad CDX syntax
			throw new InvalidDataException("Incorrect Web Archive request: " + LastCdxEntry);
		}

		Archived = true;
		ArchivedURL = string.Format("http://web.archive.org/web/{0}/{1}", ResultFields[0], ResultFields[1]);
	}

	/// <summary>
	/// Is the requested URL archived by Wayback Machine
	/// </summary>
	public bool Archived { get; private set; }

	/// <summary>
	/// Address of archived copy of requested URL (empty string if it is not archived)
	/// </summary>
	public string ArchivedURL { get; private set; }
}
}