Skip to content

Commit

Permalink
RoyalRoadL source improvements:
Browse files Browse the repository at this point in the history
- Remove donation blocks
- Remove chapter-links-table added by author
- Remove too many linebreaks
  • Loading branch information
Robbie Groenewoudt committed May 22, 2016
1 parent 4ab668c commit cc828d8
Showing 1 changed file with 46 additions and 6 deletions.
52 changes: 46 additions & 6 deletions WebNovelConverter/Sources/RoyalRoadLSource.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using AngleSharp.Dom.Html;
using AngleSharp.Extensions;
using WebNovelConverter.Sources.Models;
using System.Text.RegularExpressions;

namespace WebNovelConverter.Sources
{
Expand Down Expand Up @@ -65,23 +66,27 @@ public override async Task<WebNovelChapter> GetChapterAsync(ChapterLink link,

IHtmlDocument doc = await Parser.ParseAsync(pageContent, token);

IElement firstPostElement = (from e in doc.All
IElement postBodyEl = (from e in doc.All
where e.LocalName == "div"
where e.HasAttribute("class")
let classAttribute = e.GetAttribute("class")
where classAttribute.Contains("post_body")
select e).FirstOrDefault();

if (firstPostElement == null)
if (postBodyEl == null)
return null;

RemoveNavigation(firstPostElement);
ExpandSpoilers(firstPostElement);
RemoveNavigation(postBodyEl);
RemoveDonation(postBodyEl);
ExpandSpoilers(postBodyEl);
RemoveEmpyTags(postBodyEl);

var content = CleanupHTML(postBodyEl.InnerHtml);

return new WebNovelChapter
{
Url = link.Url,
Content = firstPostElement.InnerHtml
Content = content
};
}

Expand All @@ -104,7 +109,23 @@ where classAttribute.Contains("post_body")

/// <summary>
/// Removes navigation tables from the given element's descendants.
/// </summary>
/// <param name="rootElement">Element whose descendant tables are inspected and possibly removed.</param>
protected virtual void RemoveNavigation(IElement rootElement)
{
// NOTE(review): this statement removes the last descendant <table> unconditionally,
// before the filtered loop below runs. It looks like the pre-change line carried over
// from the diff view this text was captured from - confirm whether it still belongs here.
rootElement.Descendents<IElement>().LastOrDefault(p => p.LocalName == "table")?.Remove();
// Last 1-2 tables might be navigation

// Only the last two matched tables are considered (Reverse + Take(2)).
foreach(var table in rootElement.QuerySelectorAll("table").Reverse().Take(2))
{
// A table is removed only when at least one of its links has text containing "Chapter".
if( table.QuerySelectorAll("a").Any(x => x.TextContent.Contains("Chapter"))) {
table.Remove();
}
}
}

/// <summary>
/// Strips donation headers: every <c>div.thead</c> descendant of
/// <paramref name="rootElement"/> whose text contains "Donation for the Author"
/// is removed from the tree.
/// </summary>
/// <param name="rootElement">Element whose descendants are searched.</param>
protected virtual void RemoveDonation(IElement rootElement)
{
    // Where() is lazy, so each header is tested right before it is (possibly) removed,
    // in the same order as a plain foreach with an inner if.
    var donationHeaders = rootElement
        .QuerySelectorAll("div.thead")
        .Where(header => header.TextContent.Contains("Donation for the Author"));

    foreach (var header in donationHeaders)
    {
        header.Remove();
    }
}

/// <summary>
Expand All @@ -128,5 +149,24 @@ protected void ExpandSpoilers(IElement rootElement)
el.Remove();
}
}

/// <summary>
/// Removes <c>div</c> and <c>span</c> descendants of <paramref name="rootElement"/>
/// that have no child elements and whose text content is empty or whitespace.
/// </summary>
/// <remarks>
/// Matches are visited in reverse so that nested elements are handled before the
/// elements that contain them. A wrapper such as <c>&lt;div&gt;&lt;span&gt;&lt;/span&gt;&lt;/div&gt;</c>
/// therefore disappears completely: once the inner span is gone, the outer div has
/// no child elements left and is removed as well. Visiting in forward order would
/// test the wrapper while it still had children and leave an empty tag behind.
/// </remarks>
/// <param name="rootElement">Element whose descendants are cleaned up.</param>
private void RemoveEmpyTags(IElement rootElement)
{
    // QuerySelectorAll is expected to yield matches in document order (ancestors
    // before descendants); Reverse() buffers the matches and flips that order.
    foreach (var el in rootElement.QuerySelectorAll("div,span").Reverse())
    {
        if (el.ChildElementCount == 0 && string.IsNullOrWhiteSpace(el.TextContent))
        {
            el.Remove();
        }
    }
}

// Matches a run of three or more line-break tags, each optionally followed by
// whitespace. Accepts "<br>", "<br/>" and "<br />" in any letter case.
// Built once and reused instead of constructing a new Regex on every call.
private static readonly Regex ExcessLineBreaks =
    new Regex(@"(?:<br\s*/?>\s*){3,}", RegexOptions.Compiled | RegexOptions.IgnoreCase);

/// <summary>
/// Tidies chapter markup: collapses every run of three or more line breaks into
/// exactly two and trims leading and trailing whitespace.
/// </summary>
/// <param name="html">Markup to clean up.</param>
/// <returns>The cleaned-up markup.</returns>
private string CleanupHTML(string html)
{
    // Too many newlines sometimes
    html = ExcessLineBreaks.Replace(html, "<br /><br />");

    return html.Trim();
}
}
}

0 comments on commit cc828d8

Please sign in to comment.