From cc828d816868e1901303dc4d59f20543f1cc0522 Mon Sep 17 00:00:00 2001 From: Robbie Groenewoudt Date: Sun, 22 May 2016 20:43:28 +0200 Subject: [PATCH] RoyalRoadL source improvements: - Remove donation blocks - Remove chapter-links-table added by author - Remove too many linebreaks --- WebNovelConverter/Sources/RoyalRoadLSource.cs | 52 ++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/WebNovelConverter/Sources/RoyalRoadLSource.cs b/WebNovelConverter/Sources/RoyalRoadLSource.cs index d5c16b2..bdd5d94 100644 --- a/WebNovelConverter/Sources/RoyalRoadLSource.cs +++ b/WebNovelConverter/Sources/RoyalRoadLSource.cs @@ -8,6 +8,7 @@ using AngleSharp.Dom.Html; using AngleSharp.Extensions; using WebNovelConverter.Sources.Models; +using System.Text.RegularExpressions; namespace WebNovelConverter.Sources { @@ -65,23 +66,27 @@ public override async Task GetChapterAsync(ChapterLink link, IHtmlDocument doc = await Parser.ParseAsync(pageContent, token); - IElement firstPostElement = (from e in doc.All + IElement postBodyEl = (from e in doc.All where e.LocalName == "div" where e.HasAttribute("class") let classAttribute = e.GetAttribute("class") where classAttribute.Contains("post_body") select e).FirstOrDefault(); - if (firstPostElement == null) + if (postBodyEl == null) return null; - RemoveNavigation(firstPostElement); - ExpandSpoilers(firstPostElement); + RemoveNavigation(postBodyEl); + RemoveDonation(postBodyEl); + ExpandSpoilers(postBodyEl); + RemoveEmpyTags(postBodyEl); + + var content = CleanupHTML(postBodyEl.InnerHtml); return new WebNovelChapter { Url = link.Url, - Content = firstPostElement.InnerHtml + Content = content }; } @@ -104,7 +109,23 @@ where classAttribute.Contains("post_body") protected virtual void RemoveNavigation(IElement rootElement) { - rootElement.Descendents().LastOrDefault(p => p.LocalName == "table")?.Remove(); + // Last 1-2 tables might be navigation + + foreach(var table in 
rootElement.QuerySelectorAll("table").Reverse().Take(2)) + { + if( table.QuerySelectorAll("a").Any(x => x.TextContent.Contains("Chapter"))) { + table.Remove(); + } + } + } + + protected virtual void RemoveDonation(IElement rootElement) + { + foreach (var el in rootElement.QuerySelectorAll("div.thead")) + { + if (el.TextContent.Contains("Donation for the Author")) + el.Remove(); + } } /// <summary> @@ -128,5 +149,24 @@ protected void ExpandSpoilers(IElement rootElement) el.Remove(); } } + + private void RemoveEmpyTags(IElement rootElement) + { + foreach (var el in rootElement.QuerySelectorAll("div,span")) + { + if (string.IsNullOrWhiteSpace(el.TextContent) && el.ChildElementCount == 0) + { + el.Remove(); + } + } + } + + private string CleanupHTML(string html) + { + // Too many newlines sometimes + html = new Regex("(<br>\\s*){3,}").Replace(html, "<br><br>"); + + return html.Trim(); + } } }