|
| 1 | +using HtmlAgilityPack; |
| 2 | +using Serilog; |
| 3 | +using System; |
| 4 | +using System.Collections.Generic; |
| 5 | +using System.IO; |
| 6 | +using System.Linq; |
| 7 | +using System.Text; |
| 8 | +using System.Threading; |
| 9 | + |
| 10 | +namespace ProbToPdf |
| 11 | +{ |
/// <summary>
/// A single web page that is downloaded, reduced to its <c>id="content"</c>
/// element, and post-processed into an HTML fragment stored in <see cref="Content"/>.
/// </summary>
class Page
{
    /// <summary>Prefix used to turn relative <c>src</c> values into absolute URLs.</summary>
    private const string SiteRoot = "https://www.probabilitycourse.com/";

    /// <summary>Maximum number of download attempts before giving up on a URL.</summary>
    private const int MaxDownloadAttempts = 30;

    /// <summary>Pause between two download attempts, in milliseconds.</summary>
    private const int RetryDelayMilliseconds = 1000;

    /// <summary>Documents whose root inner length is below this are treated as failed downloads.</summary>
    private const int MinimumDocumentLength = 100;

    /// <summary>Image source that is swapped for <see cref="NewYoutubeIconSrc"/>.</summary>
    private const string OldYoutubeIconSrc = "//icons.iconarchive.com/icons/bokehlicia/captiva/32/web-google-youtube-icon.png";

    /// <summary>Replacement image source for <see cref="OldYoutubeIconSrc"/>.</summary>
    private const string NewYoutubeIconSrc = "https://cdn1.iconfinder.com/data/icons/google_jfk_icons_by_carlosjj/512/youtube.png";

    /// <summary>
    /// XPath expressions (relative to the content element) of nodes that are removed.
    /// The leading '.' matters: in HtmlAgilityPack a path starting with "//" is
    /// evaluated against the whole document, not the node it is called on.
    /// </summary>
    private static readonly string[] UnwantedXPaths =
    {
        ".//div[contains(@class,'thinblock')]",          // next- and previous-page arrows
        ".//div[contains(@class,'hide_print')]/a/span",  // "Video available"
        ".//script",                                     // <script> tags
        ".//comment()"                                   // comments
    };

    /// <summary>Address of the page to download.</summary>
    public string Url { get; set; }

    /// <summary>Processed HTML fragment; assigned by <see cref="Process"/>.</summary>
    public string Content { get; set; }

    /// <summary>Ordering hint for this page; defaults to <see cref="ProbToPdf.Order.DEFAULT"/>.</summary>
    public Order Order { get; set; } = Order.DEFAULT;

    /// <summary>
    /// Downloads <see cref="Url"/>, processes it and stores the result in <see cref="Content"/>.
    /// </summary>
    public void Process()
    {
        Content = GetPage(Url);
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns the inner HTML of its
    /// content element with math delimiters rewritten and styling appended.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <returns>The processed HTML fragment.</returns>
    private string GetPage(string url)
    {
        Log.Information("Parsing: " + url);
        HtmlDocument html = DownloadHtml(url);

        string result = ParseHtml(html).InnerHtml;

        result = FixInlineMath(result);
        result = AddStyling(result);

        return result;
    }

    /// <summary>
    /// Loads <paramref name="url"/>, retrying once per second while the response is not
    /// HTTP 200 or the document is shorter than <see cref="MinimumDocumentLength"/>.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <returns>The downloaded document.</returns>
    /// <exception cref="System.Net.WebException">
    /// No acceptable response was received within <see cref="MaxDownloadAttempts"/> attempts.
    /// </exception>
    private static HtmlDocument DownloadHtml(string url)
    {
        var web = new HtmlWeb();

        for (int attempt = 1; ; attempt++)
        {
            HtmlDocument html = web.Load(url);

            if (web.StatusCode == System.Net.HttpStatusCode.OK
                && html.DocumentNode.InnerLength >= MinimumDocumentLength)
            {
                return html;
            }

            Log.Warning("Url: " + url + ", Statuscode: " + web.StatusCode);

            // Bound the retries so a permanently failing URL (e.g. a 404) cannot hang the run forever.
            if (attempt >= MaxDownloadAttempts)
            {
                throw new System.Net.WebException(
                    "Giving up on '" + url + "' after " + MaxDownloadAttempts
                    + " attempts; last status code: " + web.StatusCode);
            }

            Thread.Sleep(RetryDelayMilliseconds);
        }
    }

    /// <summary>
    /// Selects the element with <c>id="content"</c>, removes unwanted descendants and
    /// rewrites <c>href</c>/<c>src</c> attributes inside it.
    /// </summary>
    /// <param name="html">The downloaded document.</param>
    /// <returns>The cleaned content element.</returns>
    /// <exception cref="InvalidOperationException">The document has no element with <c>id="content"</c>.</exception>
    private HtmlNode ParseHtml(HtmlDocument html)
    {
        // TODO: Make parsing dynamic, e.g. from a json file
        HtmlNode node = html.DocumentNode.SelectSingleNode("//*[@id=\"content\"]");
        if (node == null)
        {
            throw new InvalidOperationException("No element with id=\"content\" found in: " + Url);
        }

        // Remove unwanted html elements
        foreach (string xpath in UnwantedXPaths)
        {
            HtmlNodeCollection matches = node.SelectNodes(xpath);
            if (matches == null)
            {
                continue;
            }

            // Snapshot the matches so removing nodes cannot disturb the enumeration.
            foreach (HtmlNode match in matches.ToList())
            {
                match.Remove();
            }
        }

        // Fix links
        FixHrefs(node);
        FixSrcs(node);

        return node;
    }

    /// <summary>
    /// Gives scheme-less <c>href</c> values below <paramref name="node"/> an explicit
    /// <c>https</c> scheme: "//host/..." and "www..." both become "https://...".
    /// Other values are left unchanged.
    /// </summary>
    /// <param name="node">Root of the subtree to rewrite.</param>
    private static void FixHrefs(HtmlNode node)
    {
        HtmlNodeCollection elements = node.SelectNodes(".//*[@href]");
        if (elements == null)
        {
            return;
        }

        foreach (HtmlNode element in elements)
        {
            string href = element.Attributes["href"].Value;

            if (href.StartsWith("//", StringComparison.Ordinal))
            {
                // Protocol-relative URL: keep the host, only add the scheme.
                href = "https:" + href;
            }
            else if (href.StartsWith("www", StringComparison.Ordinal))
            {
                href = "https://" + href;
            }

            element.Attributes["href"].Value = href;
        }
    }

    /// <summary>
    /// Makes <c>src</c> values below <paramref name="node"/> absolute and swaps the
    /// old YouTube icon for its replacement (adding a fixed height to its style).
    /// </summary>
    /// <param name="node">Root of the subtree to rewrite.</param>
    private static void FixSrcs(HtmlNode node)
    {
        HtmlNodeCollection elements = node.SelectNodes(".//*[@src]");
        if (elements == null)
        {
            return;
        }

        foreach (HtmlNode element in elements)
        {
            string src = element.Attributes["src"].Value;

            if (src == OldYoutubeIconSrc)
            {
                src = NewYoutubeIconSrc;

                // The element may have no style attribute at all; treat that as an empty style.
                string style = element.GetAttributeValue("style", string.Empty);
                if (style.Length > 0 && !style.TrimEnd().EndsWith(";", StringComparison.Ordinal))
                {
                    // Terminate the last declaration so the appended one is not glued onto it.
                    style += ";";
                }
                element.SetAttributeValue("style", style + "height: 50px;");
            }
            else if (src.StartsWith("//", StringComparison.Ordinal))
            {
                // Protocol-relative URL pointing at another host: only the scheme is missing.
                src = "https:" + src;
            }
            else if (src.StartsWith("www", StringComparison.Ordinal))
            {
                src = "https://" + src;
            }
            else if (!src.StartsWith("http", StringComparison.Ordinal))
            {
                // Anything else is taken to be relative to the site root.
                src = SiteRoot + src;
            }

            element.Attributes["src"].Value = src;
        }
    }

    /// <summary>
    /// Changes '$ *equation* $' to '\( *equation* \)'.
    /// A '$' that is immediately followed by another '$' is dropped, so "$$ ... $$"
    /// is rewritten to "\( ... \)" as well.
    /// </summary>
    /// <param name="content">HTML text containing '$' math delimiters.</param>
    /// <returns>The text with alternating '$' replaced by "\(" and "\)".</returns>
    private static string FixInlineMath(string content)
    {
        bool insideMath = false;
        var sb = new StringBuilder(content.Length);

        for (int i = 0; i < content.Length; i++)
        {
            char current = content[i];
            if (current != '$')
            {
                sb.Append(current);
                continue;
            }

            // Bounds check: a '$' at the very end of the text has no successor to inspect.
            bool followedByDollar = i + 1 < content.Length && content[i + 1] == '$';
            if (followedByDollar)
            {
                // First '$' of a "$$" pair: emit nothing, the next iteration handles the second one.
                continue;
            }

            sb.Append(insideMath ? "\\)" : "\\(");
            insideMath = !insideMath;
        }

        return sb.ToString();
    }

    /// <summary>
    /// Appends a link to the site's stylesheet and an inline <c>&lt;style&gt;</c> block
    /// holding the contents of the local file "style.css".
    /// </summary>
    /// <param name="content">HTML fragment to extend.</param>
    /// <returns><paramref name="content"/> followed by the styling markup.</returns>
    private static string AddStyling(string content)
    {
        string styling = "<link href=\"https://www.probabilitycourse.com/style_sheet.css\" type=\"text/css\" rel=\"stylesheet\" />\n";
        styling += "<style>\n";
        styling += File.ReadAllText("style.css"); // resolved against the current working directory
        styling += "\n</style>";

        return content + styling;
    }
}
| 151 | + |
/// <summary>
/// Ordering hint carried by a page.
/// NOTE(review): presumably decides where a page is placed in the generated
/// output; the consuming code is not visible here - confirm against the caller.
/// </summary>
enum Order
{
    /// <summary>No special placement; the value a page starts with.</summary>
    DEFAULT = 0,

    FIRST = 1,

    LAST = 2
}
| 156 | +} |
0 commit comments