Skip to content

Commit

Permalink
.
Browse files Browse the repository at this point in the history
  • Loading branch information
haouarihk committed Nov 5, 2023
1 parent f8fc878 commit 9e2a0fc
Showing 1 changed file with 6 additions and 5 deletions.
11 changes: 6 additions & 5 deletions src/extractors/rss-extractor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,19 @@ export default class RssExtractor extends Extractor {
return urls;
}



/**
 * Extracts the distinct feed-like URLs found in a piece of text.
 *
 * A URL is picked up when it starts with `http://` or `https://`, continues
 * with non-whitespace characters, and ends with one of `.rss`, `:feed`,
 * `/rss` or `/feeds`.
 *
 * @param text - Arbitrary text to scan for feed URLs.
 * @returns The matched URLs with duplicates removed, in order of first
 *   appearance; an empty array when nothing matches.
 */
private extractUrls(text: string): string[] {
    // The suffix must sit at a word boundary and must not be followed by a
    // word character, '-', '.' or '%', so a match cannot stop part-way through
    // a longer path segment such as '/rss-archive' or '/feeds.xml'.
    // The lazy quantifier makes each match end at the first qualifying suffix.
    const rssUrlRegex = /https?:\/\/[^\s]+?(?:\.rss|:feed|\/rss|\/feeds)\b(?![\w\-\.%])/g;

    // With the global flag, match() yields every full match, or null if none.
    // Intentionally no logging here: `text` may be large or contain user content.
    const matches = text.match(rssUrlRegex);

    // A Set drops repeated URLs while keeping first-occurrence order.
    return matches ? Array.from(new Set(matches)) : [];
}



// Optionally, extract URLs from a given file
Expand All @@ -75,8 +76,8 @@ export default class RssExtractor extends Extractor {
itemText += `Description: ${item.contentSnippet}\n`;
}
if (item.content) {
itemText += `Content: \n ${item.content}\n`;
}
itemText += `Content: \n ${item.content}\n`;
}
itemText += "\n"; // Add a new line after each item for readability
return itemText;
}
Expand Down

0 comments on commit 9e2a0fc

Please sign in to comment.