Skip to content

Commit

Permalink
Fix some <a> tags whose href could not be fixed up with the domain
Browse files Browse the repository at this point in the history
  • Loading branch information
Lewis authored and Lewis committed Mar 6, 2019
1 parent df2f8b4 commit cfd674f
Showing 1 changed file with 46 additions and 34 deletions.
80 changes: 46 additions & 34 deletions src/DotnetSpider.Extraction/Selectable.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ public class Selectable : AbstractSelectable
/// <param name="removeOutboundLinks">是否去除外链</param>
public Selectable(string html, string url, bool removeOutboundLinks = true)
{
HtmlDocument document = new HtmlDocument { OptionAutoCloseOnEnd = true };
HtmlDocument document = new HtmlDocument {OptionAutoCloseOnEnd = true};
document.LoadHtml(html);

if (!string.IsNullOrWhiteSpace(url))
Expand All @@ -32,7 +32,8 @@ public Selectable(string html, string url, bool removeOutboundLinks = true)
RemoveOutboundLinks(document, domain);
}
}
Elements = new List<dynamic> { document.DocumentNode.OuterHtml };

Elements = new List<dynamic> {document.DocumentNode.OuterHtml};
}

/// <summary>
Expand All @@ -41,7 +42,7 @@ public Selectable(string html, string url, bool removeOutboundLinks = true)
/// <param name="json">Json</param>
public Selectable(string json)
{
Elements = new List<dynamic> { json };
Elements = new List<dynamic> {json};
}

/// <summary>
Expand Down Expand Up @@ -86,29 +87,29 @@ public override dynamic Environment(string field)
switch (key)
{
case "now":
{
return DateTime.Now.ToString("yyyy/MM/dd hh:mm:ss");
}
{
return DateTime.Now.ToString("yyyy/MM/dd hh:mm:ss");
}
case "monday":
{
var now = DateTime.Now;
int i = now.DayOfWeek - DayOfWeek.Monday == -1 ? 6 : -1;
TimeSpan ts = new TimeSpan(i, 0, 0, 0);
return now.Subtract(ts).Date.ToString("yyyy/MM/dd hh:mm:ss");
}
{
var now = DateTime.Now;
int i = now.DayOfWeek - DayOfWeek.Monday == -1 ? 6 : -1;
TimeSpan ts = new TimeSpan(i, 0, 0, 0);
return now.Subtract(ts).Date.ToString("yyyy/MM/dd hh:mm:ss");
}
case "today":
{
return DateTime.Now.Date.ToString("yyyy/MM/dd hh:mm:ss");
}
{
return DateTime.Now.Date.ToString("yyyy/MM/dd hh:mm:ss");
}
case "monthly":
{
var now = DateTime.Now;
return now.AddDays(now.Day * -1 + 1).ToString("yyyy/MM/dd hh:mm:ss");
}
{
var now = DateTime.Now;
return now.AddDays(now.Day * -1 + 1).ToString("yyyy/MM/dd hh:mm:ss");
}
default:
{
return Properties.ContainsKey(field) ? Properties[field] : null;
}
{
return Properties.ContainsKey(field) ? Properties[field] : null;
}
}
}

Expand All @@ -128,13 +129,15 @@ public override ISelectable Links()
results.Add(link);
}
}

foreach (var link in sourceLinks)
{
if (Uri.TryCreate(link, UriKind.RelativeOrAbsolute, out _))
{
results.Add(link);
}
}

return new Selectable(results.ToList());
}

Expand Down Expand Up @@ -166,8 +169,10 @@ public override ISelectable Select(ISelector selector)
results.Add(result);
}
}

return new Selectable(results);
}

throw new ExtractionException($"{nameof(selector)} is null.");
}

Expand All @@ -189,6 +194,7 @@ public override ISelectable SelectList(ISelector selector)
results.AddRange(result);
}
}

return new Selectable(results);
}

Expand All @@ -204,8 +210,9 @@ public override IEnumerable<ISelectable> Nodes()
List<ISelectable> result = new List<ISelectable>();
foreach (var element in Elements)
{
result.Add(new Selectable(new List<dynamic>() { element }));
result.Add(new Selectable(new List<dynamic>() {element}));
}

return result;
}

Expand Down Expand Up @@ -242,26 +249,28 @@ public static string CanonicalizeUrl(string url, string refer)

private void FixAllRelativeHref(HtmlDocument document, string url)
{
var nodes = document.DocumentNode.SelectNodes("//a[not(starts-with(@href,'http') or starts-with(@href,'https'))]");
if (nodes != null)
var hrefNodes = document.DocumentNode.SelectNodes(".//@href");
if (hrefNodes != null)
{
foreach (var node in nodes)
foreach (var node in hrefNodes)
{
if (node.Attributes["href"] != null)
var href = node.Attributes["href"].Value;
if (!string.IsNullOrWhiteSpace(href) && !href.Contains("http") && !href.Contains("https"))
{
node.Attributes["href"].Value = CanonicalizeUrl(node.Attributes["href"].Value, url);
node.Attributes["href"].Value = CanonicalizeUrl(href, url);
}
}
}

var images = document.DocumentNode.SelectNodes(".//img");
if (images != null)
var srcNodes = document.DocumentNode.SelectNodes(".//@src");
if (srcNodes != null)
{
foreach (var image in images)
foreach (var node in srcNodes)
{
if (image.Attributes["src"] != null)
var src = node.Attributes["src"].Value;
if (!string.IsNullOrWhiteSpace(src) && !src.Contains("http") && !src.Contains("https"))
{
image.Attributes["src"].Value = CanonicalizeUrl(image.Attributes["src"].Value, url);
node.Attributes["src"].Value = CanonicalizeUrl(src, url);
}
}
}
Expand All @@ -279,17 +288,20 @@ private void RemoveOutboundLinks(HtmlDocument document, params string[] domains)
foreach (var domain in domains)
{
var href = node.Attributes["href"]?.Value;
if (!string.IsNullOrWhiteSpace(href) && System.Text.RegularExpressions.Regex.IsMatch(href, domain))
if (!string.IsNullOrWhiteSpace(href) &&
System.Text.RegularExpressions.Regex.IsMatch(href, domain))
{
isMatch = true;
break;
}
}

if (!isMatch)
{
deleteNodes.Add(node);
}
}

foreach (var node in deleteNodes)
{
node.Remove();
Expand Down

0 comments on commit cfd674f

Please sign in to comment.