Skip to content

Commit 5857cd6

Browse files
committed
Initial commit
1 parent 6600cb7 commit 5857cd6

10 files changed

+820
-0
lines changed

ProbToPdf.sln

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio 15
4+
VisualStudioVersion = 15.0.28010.2003
5+
MinimumVisualStudioVersion = 10.0.40219.1
6+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ProbToPdf", "ProbToPdf\ProbToPdf.csproj", "{22A0B792-A7BD-4AAA-996E-98E57A08BA25}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|Any CPU = Debug|Any CPU
11+
Release|Any CPU = Release|Any CPU
12+
EndGlobalSection
13+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14+
{22A0B792-A7BD-4AAA-996E-98E57A08BA25}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15+
{22A0B792-A7BD-4AAA-996E-98E57A08BA25}.Debug|Any CPU.Build.0 = Debug|Any CPU
16+
{22A0B792-A7BD-4AAA-996E-98E57A08BA25}.Release|Any CPU.ActiveCfg = Release|Any CPU
17+
{22A0B792-A7BD-4AAA-996E-98E57A08BA25}.Release|Any CPU.Build.0 = Release|Any CPU
18+
EndGlobalSection
19+
GlobalSection(SolutionProperties) = preSolution
20+
HideSolutionNode = FALSE
21+
EndGlobalSection
22+
GlobalSection(ExtensibilityGlobals) = postSolution
23+
SolutionGuid = {E4D26F86-AC00-40E0-9F1E-BB1800E812C6}
24+
EndGlobalSection
25+
EndGlobal

ProbToPdf/Book.cs

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using Newtonsoft.Json;
5+
using Newtonsoft.Json.Linq;
6+
using Serilog;
7+
8+
namespace ProbToPdf
9+
{
10+
/// <summary>
/// A book definition loaded from a JSON file: metadata plus the pages that make it up.
/// </summary>
/// <remarks>
/// Constructing a <see cref="Book"/> reads the file and immediately calls
/// <see cref="Page.Process"/> on every page of every chapter and on every page in
/// <see cref="Other"/>.
/// </remarks>
class Book
{
    /// <summary>Title of the book, as given in the JSON file.</summary>
    public string Title { get; set; }

    /// <summary>Author of the book, as given in the JSON file.</summary>
    public string Author { get; set; }

    /// <summary>Chapters of the book. Never null once the constructor has returned.</summary>
    public List<Chapter> Chapters { get; set; }

    /// <summary>Pages that do not belong to a chapter. Never null once the constructor has returned.</summary>
    public List<Page> Other { get; set; }

    /// <summary>
    /// Loads the book from <paramref name="filePath"/> and processes all of its pages.
    /// </summary>
    /// <param name="filePath">Path of the JSON file describing the book.</param>
    public Book(string filePath)
    {
        string json = File.ReadAllText(filePath);
        JsonConvert.PopulateObject(json, this);

        // PopulateObject only assigns the properties present in the JSON, so a file
        // without "Chapters"/"Other" (or a chapter without "Pages") would leave nulls
        // behind and crash on the first ForEach. Normalise everything to lists.
        Chapters = WithoutNulls(Chapters);
        foreach (Chapter chapter in Chapters)
        {
            chapter.Pages = WithoutNulls(chapter.Pages);
        }
        Other = WithoutNulls(Other);

        Process();
    }

    /// <summary>
    /// Returns <paramref name="items"/> with null entries removed, or a new empty list
    /// when <paramref name="items"/> itself is null.
    /// </summary>
    private static List<T> WithoutNulls<T>(List<T> items) where T : class
    {
        if (items == null)
        {
            return new List<T>();
        }

        items.RemoveAll(item => item == null);
        return items;
    }

    /// <summary>Processes every chapter page first, then every page in <see cref="Other"/>.</summary>
    private void Process()
    {
        Chapters.ForEach(c => c.Pages.ForEach(p => p.Process()));
        Other.ForEach(o => o.Process());
    }

    /// <summary>
    /// Returns the fixed string <c>"book"</c>.
    /// </summary>
    /// <remarks>
    /// Downloader and PDFGenerator append this value to the desktop path to build the
    /// output folder, so it doubles as the folder name; it intentionally does not
    /// include <see cref="Title"/> or <see cref="Author"/>.
    /// </remarks>
    public override string ToString()
    {
        return "book";
    }
}
36+
}

ProbToPdf/Chapter.cs

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Text;
4+
5+
namespace ProbToPdf
6+
{
7+
/// <summary>
/// A chapter of a book: a list of pages.
/// </summary>
class Chapter
{
    /// <summary>
    /// Pages belonging to this chapter. Starts out as an empty list instead of null so
    /// that a chapter whose definition omits its pages can still be iterated.
    /// </summary>
    public List<Page> Pages { get; set; } = new List<Page>();
}
11+
}

ProbToPdf/Downloader.cs

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
using Serilog;
2+
using System;
3+
using System.Collections.Generic;
4+
using System.IO;
5+
using System.Linq;
6+
using System.Threading.Tasks;
7+
8+
namespace ProbToPdf
9+
{
10+
/// <summary>
/// Writes the processed pages of a <see cref="Book"/> to a folder on the user's desktop.
/// </summary>
class Downloader
{
    /// <summary>
    /// Creates the output folder (desktop path + "\" + <paramref name="book"/>'s string
    /// form), writes a <c>config.yml</c> enabling the mathjax plugin, and writes one
    /// HTML file per page.
    /// </summary>
    /// <param name="book">Book whose pages have already been processed.</param>
    internal static void Download(Book book)
    {
        String path = Environment.GetFolderPath(Environment.SpecialFolder.Desktop) + "\\" + book;

        Directory.CreateDirectory(path);
        File.WriteAllText(path + "\\config.yml", "plugins:\n- mathjax");

        // Non-chapter pages ordered before LAST (DEFAULT and FIRST) go first ...
        foreach (Page page in book.Other.Where(p => p.Order < Order.LAST).ToList())
        {
            WritePage(page, path);
        }

        // ... then every chapter page ...
        foreach (Chapter chapter in book.Chapters)
        {
            foreach (Page page in chapter.Pages)
            {
                WritePage(page, path);
            }
        }

        // ... and finally the non-chapter pages marked LAST.
        foreach (Page page in book.Other.Where(p => p.Order == Order.LAST).ToList())
        {
            WritePage(page, path);
        }
    }

    /// <summary>
    /// Writes the content of <paramref name="p"/> into <paramref name="path"/>. The file
    /// name is the last segment of the page URL with ".php" replaced by ".html".
    /// </summary>
    private static void WritePage(Page p, string path)
    {
        string page = p.Url.Split('/').Last().Replace(".php", ".html");
        Log.Information("Writing to disk: " + page);

        // Write synchronously: the previous un-awaited WriteAllTextAsync returned before
        // the file was guaranteed to exist and silently dropped any I/O exception.
        File.WriteAllText(path + "\\" + page, p.Content);
    }
}
39+
}

ProbToPdf/PDFGenerator.cs

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
using Serilog;
2+
using System;
3+
using System.Collections.Generic;
4+
using System.Linq;
5+
using System.Management.Automation;
6+
7+
namespace ProbToPdf
8+
{
9+
/// <summary>
/// Converts the HTML files of a <see cref="Book"/> to PDF and merges them, by running
/// the external <c>relaxed</c> and <c>pdfunite</c> commands through PowerShell.
/// </summary>
class PDFGenerator
{
    /// <summary>
    /// Runs <c>relaxed</c> once per page file, then <c>pdfunite</c> over the resulting
    /// PDFs to produce <c>output.pdf</c>.
    /// </summary>
    /// <param name="book">Book whose page files are in the desktop output folder.</param>
    internal static void Generate(Book book)
    {
        String path = Environment.GetFolderPath(Environment.SpecialFolder.Desktop) + "\\" + book;

        // NOTE(review): pages are merged as "all chapters, then all Other pages", whereas
        // Downloader puts Other pages with Order < LAST before the chapters. Confirm which
        // order the final PDF is supposed to have.
        List<Page> pages = new List<Page>();
        book.Chapters.ForEach(c => pages.AddRange(c.Pages));
        pages.AddRange(book.Other);

        List<string> files = pages
            .Select(p => path + "\\" + p.Url.Split('/').Last().Replace(".php", ".html"))
            .ToList();

        foreach (string file in files)
        {
            Execute($"relaxed {Quote(file)} --bo");
        }

        IEnumerable<string> pdfs = files.Select(f => Quote(f.Replace(".html", ".pdf")));
        Execute($"pdfunite {String.Join(" ", pdfs)} output.pdf");
    }

    /// <summary>
    /// Wraps <paramref name="value"/> in a PowerShell single-quoted string literal,
    /// doubling embedded single quotes, so that paths containing spaces or quotes are
    /// passed to the command as one argument.
    /// </summary>
    private static string Quote(string value)
    {
        return "'" + value.Replace("'", "''") + "'";
    }

    /// <summary>
    /// Runs <paramref name="command"/> as a PowerShell script and logs its output.
    /// </summary>
    /// <param name="command">
    /// Complete script text. Arguments must already be quoted by the caller; escaping
    /// the whole command here would also double the quote delimiters and undo the quoting.
    /// </param>
    private static void Execute(string command)
    {
        Log.Information("Executing: " + command);
        using (var ps = PowerShell.Create())
        {
            var results = ps.AddScript(command).Invoke();
            foreach (var result in results)
            {
                Log.Information(result.ToString());
            }

            // Surface anything written to the PowerShell error stream instead of dropping it.
            foreach (var error in ps.Streams.Error)
            {
                Log.Error(error.ToString());
            }
        }
    }
}
39+
}

ProbToPdf/Page.cs

+156
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
using HtmlAgilityPack;
2+
using Serilog;
3+
using System;
4+
using System.Collections.Generic;
5+
using System.IO;
6+
using System.Linq;
7+
using System.Text;
8+
using System.Threading;
9+
10+
namespace ProbToPdf
11+
{
12+
/// <summary>
/// A single web page of the book. <see cref="Process"/> downloads the page, keeps the
/// element with id "content", cleans it up and stores the resulting HTML in
/// <see cref="Content"/>.
/// </summary>
class Page
{
    private const string SiteRoot = "https://www.probabilitycourse.com/";
    private const string OldYoutubeIcon = "//icons.iconarchive.com/icons/bokehlicia/captiva/32/web-google-youtube-icon.png";
    private const string NewYoutubeIcon = "https://cdn1.iconfinder.com/data/icons/google_jfk_icons_by_carlosjj/512/youtube.png";

    /// <summary>Address the page is downloaded from.</summary>
    public string Url { get; set; }

    /// <summary>Processed HTML of the page; set by <see cref="Process"/>.</summary>
    public string Content { get; set; }

    /// <summary>Placement hint for the page; <see cref="Order.DEFAULT"/> unless specified.</summary>
    public Order Order { get; set; } = Order.DEFAULT;

    /// <summary>Downloads and processes <see cref="Url"/>, storing the result in <see cref="Content"/>.</summary>
    public void Process()
    {
        Content = GetPage(Url);
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and returns the inner HTML of its content
    /// element with inline math delimiters rewritten and styling appended.
    /// </summary>
    private string GetPage(string url)
    {
        Log.Information("Parsing: " + url);
        HtmlDocument html = DownloadHtml(url);

        string result = ParseHtml(html).InnerHtml;

        result = FixInlineMath(result);
        result = AddStyling(result);

        return result;
    }

    /// <summary>
    /// Loads <paramref name="url"/>, retrying once per second until the status code is
    /// OK and the document is at least 100 characters long.
    /// </summary>
    private static HtmlDocument DownloadHtml(string url)
    {
        var web = new HtmlWeb();
        var html = web.Load(url);

        // NOTE(review): this retries forever; a permanently failing URL (e.g. a 404)
        // never returns. Consider capping the number of attempts.
        while (web.StatusCode != System.Net.HttpStatusCode.OK || html.DocumentNode.InnerLength < 100)
        {
            Log.Warning("Url: " + url + ", Statuscode: " + web.StatusCode);
            Thread.Sleep(1000);
            html = web.Load(url);
        }

        return html;
    }

    /// <summary>
    /// Selects the element with id "content", removes unwanted elements and fixes
    /// href/src attributes.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The document has no element with id "content".
    /// </exception>
    private HtmlNode ParseHtml(HtmlDocument html)
    {
        // TODO: Make parsing dynamic, e.g. from a json file
        HtmlNode node = html.DocumentNode.SelectSingleNode("//*[@id=\"content\"]");
        if (node == null)
        {
            // Fail with a clear message instead of a NullReferenceException further down.
            throw new InvalidOperationException("The downloaded page has no element with id \"content\".");
        }

        // Remove unwanted html elements
        List<string> xpaths = new List<string>
        {
            "//div[contains(@class,'thinblock')]",          // next- and previous page arrows
            "//div[contains(@class,'hide_print')]/a/span",  // "Video available"
            "//script",                                     // <script> tags
            "//comment()"                                   // comments
        };
        foreach (string xpath in xpaths)
        {
            HtmlNodeCollection matches = node.SelectNodes(xpath);
            if (matches == null)
            {
                continue;
            }

            // Snapshot the matches before removing so the collection is not modified while iterating.
            foreach (HtmlNode match in matches.ToList())
            {
                match.Remove();
            }
        }

        // Fix links
        FixHrefs(node);
        FixSrcs(node);

        return node;
    }

    /// <summary>
    /// Gives scheme-less links an explicit "https" scheme: protocol-relative links
    /// ("//host/...") and links starting with "www". Other links are left untouched.
    /// </summary>
    private static void FixHrefs(HtmlNode node)
    {
        var hrefs = node.SelectNodes("//*[@href]");
        if (hrefs == null || hrefs.Count < 1)
        {
            return;
        }

        foreach (var href in hrefs)
        {
            string newHref = href.Attributes["href"].Value;
            if (newHref.StartsWith("//", StringComparison.Ordinal))
            {
                // Keep the host of a protocol-relative link; merely stripping "//" left a
                // relative link unless the host happened to start with "www".
                newHref = "https:" + newHref;
            }
            else if (newHref.StartsWith("www", StringComparison.Ordinal))
            {
                newHref = "https://" + newHref;
            }
            href.Attributes["href"].Value = newHref;
        }
    }

    /// <summary>
    /// Makes every src attribute absolute and swaps the old YouTube icon for a
    /// replacement image with a fixed height.
    /// </summary>
    private static void FixSrcs(HtmlNode node)
    {
        var srcs = node.SelectNodes("//*[@src]");
        if (srcs == null || srcs.Count < 1)
        {
            return;
        }

        foreach (var src in srcs)
        {
            string newSrc = src.Attributes["src"].Value;
            if (newSrc == OldYoutubeIcon)
            {
                newSrc = NewYoutubeIcon;

                // The element may have no style attribute at all; read it with a default
                // and create it on write instead of dereferencing a missing attribute.
                string style = src.GetAttributeValue("style", "");
                if (style.Length > 0 && !style.TrimEnd().EndsWith(";", StringComparison.Ordinal))
                {
                    style += ";";
                }
                src.SetAttributeValue("style", style + "height: 50px;");
            }
            else if (newSrc.StartsWith("//", StringComparison.Ordinal))
            {
                // Protocol-relative source: it already names its host, so only add a scheme.
                newSrc = "https:" + newSrc;
            }
            else if (newSrc.StartsWith("www", StringComparison.Ordinal))
            {
                // Same treatment FixHrefs gives to links starting with "www".
                newSrc = "https://" + newSrc;
            }
            else if (!newSrc.StartsWith("http", StringComparison.Ordinal))
            {
                // Anything else without a scheme is resolved against the site root.
                newSrc = SiteRoot + newSrc;
            }
            src.Attributes["src"].Value = newSrc;
        }
    }

    /// <summary>
    /// Changes '$ *equation* $' to '\( *equation* \)'.
    /// </summary>
    /// <remarks>
    /// Delimiters alternate between opening and closing. A '$' that is immediately
    /// followed by another '$' is dropped without changing that alternation; the second
    /// '$' is then handled as an ordinary delimiter.
    /// </remarks>
    private static string FixInlineMath(string content)
    {
        int counter = 0;
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < content.Length; i++)
        {
            if (content[i] != '$')
            {
                sb.Append(content[i]);
                continue;
            }

            // Bounds check: a '$' may be the very last character of the content.
            bool nextIsDollar = i + 1 < content.Length && content[i + 1] == '$';
            if (nextIsDollar)
            {
                counter += 2;
                continue;
            }

            sb.Append(counter % 2 == 0 ? "\\(" : "\\)");
            counter++;
        }
        return sb.ToString();
    }

    /// <summary>
    /// Appends a link to the site's stylesheet and an inline &lt;style&gt; block holding
    /// the contents of the local file "style.css" to <paramref name="content"/>.
    /// </summary>
    private static string AddStyling(string content)
    {
        string styling = "<link href=\"https://www.probabilitycourse.com/style_sheet.css\" type=\"text/css\" rel=\"stylesheet\" />\n";
        styling += "<style>\n";
        styling += File.ReadAllText("style.css");
        styling += "\n</style>";

        return content + styling;
    }
}
151+
152+
/// <summary>
/// Placement hint for a page.
/// </summary>
/// <remarks>
/// The numeric values are significant: Downloader compares members with
/// <c>&lt;</c> (everything below <see cref="LAST"/> is written before the chapters).
/// </remarks>
enum Order
{
    /// <summary>No explicit placement; the value a page has unless told otherwise.</summary>
    DEFAULT = 0,

    /// <summary>Sorts below <see cref="LAST"/>, like <see cref="DEFAULT"/>.</summary>
    FIRST = 1,

    /// <summary>Highest value; Downloader writes these pages after the chapters.</summary>
    LAST = 2
}
156+
}

ProbToPdf/ProbToPdf.csproj

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>netcoreapp2.1</TargetFramework>
6+
</PropertyGroup>
7+
8+
<ItemGroup>
9+
<PackageReference Include="Fizzler.Systems.HtmlAgilityPack" Version="1.2.0" />
10+
<PackageReference Include="HtmlAgilityPack" Version="1.8.9" />
11+
<PackageReference Include="microsoft.powershell.commands.diagnostics" Version="6.0.0-rc" />
12+
<PackageReference Include="microsoft.powershell.sdk" Version="6.0.0-rc" />
13+
<PackageReference Include="microsoft.wsman.management" Version="6.0.0-rc" />
14+
<PackageReference Include="Serilog.Sinks.Console" Version="3.1.1" />
15+
</ItemGroup>
16+
17+
<ItemGroup>
18+
<None Update="book.json">
19+
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
20+
</None>
21+
<None Update="style.css">
22+
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
23+
</None>
24+
</ItemGroup>
25+
26+
</Project>

0 commit comments

Comments
 (0)