Skip to content

Commit

Permalink
refactor: Added WaitJob helper.
Browse files Browse the repository at this point in the history
  • Loading branch information
HavenDV committed Sep 8, 2024
1 parent 5a62d9e commit 2f6db18
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 35 deletions.
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,39 @@ using Firecrawl;

using var api = new FirecrawlApp(apiKey);

// Scrape
var response = await api.Scraping.ScrapeAsync("https://docs.firecrawl.dev/features/scrape");

string markdown = response.Data.Markdown;

// Crawl
var crawlResponse = await api.Crawling.CrawlUrlsAsync(
url: "https://docs.firecrawl.dev/",
crawlerOptions: new CrawlUrlsRequestCrawlerOptions
{
Limit = 3,
},
pageOptions: new CrawlUrlsRequestPageOptions
{
OnlyMainContent = true,
});

var jobResponse = await api.Crawl.WaitJobAsync(
jobId: crawlResponse.JobId);

foreach (var data in jobResponse.Data)
{
Console.WriteLine($"URL: {data.Metadata.SourceURL}");
Console.WriteLine($"Markdown: {data.Markdown}");
}
```

### CLI
```bash
dotnet tool install -g Firecrawl.Cli
firecrawl auth <API_KEY>
firecrawl scrape https://docs.firecrawl.dev/features/scrape # saves it to output.md
firecrawl crawl https://docs.firecrawl.dev/features/scrape --limit 5 # saves all .md files to docs.firecrawl.dev folder
```

## Support
Expand Down
21 changes: 5 additions & 16 deletions src/libs/Firecrawl.Cli/Commands/CrawlCommand.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using System.CommandLine;
using System.Diagnostics.CodeAnalysis;

namespace Firecrawl.Cli.Commands;

Expand All @@ -21,7 +20,7 @@ public CrawlCommand() : base(name: "crawl", description: "Crawl a url and saves

var limit = new Option<int>(
name: "limit",
getDefaultValue: () => 10,
getDefaultValue: () => 5,
description: "Limit of pages to crawl");
AddOption(limit);

Expand Down Expand Up @@ -67,18 +66,8 @@ private static async Task HandleAsync(

Console.WriteLine($"JobId: {response.JobId}");

GetCrawlStatusResponse? statusResponse = null;
while (true)
{
await Task.Delay(TimeSpan.FromSeconds(5)).ConfigureAwait(false);

statusResponse = await api.Crawl.GetCrawlStatusAsync(
jobId: response.JobId!).ConfigureAwait(false);
if (statusResponse.Status == "completed")
{
break;
}
}
var jobResponse = await api.Crawl.WaitJobAsync(
jobId: response.JobId!).ConfigureAwait(false);

if (string.IsNullOrWhiteSpace(outputPath))
{
Expand All @@ -88,7 +77,7 @@ private static async Task HandleAsync(
Directory.CreateDirectory(outputPath);

var index = 0;
foreach (var data in statusResponse.Data ?? [])
foreach (var data in jobResponse.Data ?? [])
{
var name = string.IsNullOrWhiteSpace(data.Metadata?.SourceURL)
? $"output{++index}.md"
Expand All @@ -115,7 +104,7 @@ public static string ConvertUrlToFilename(string url)
.Replace("www.", string.Empty, StringComparison.OrdinalIgnoreCase);

// Replace invalid filename characters with '_'
foreach (char c in Path.GetInvalidFileNameChars())
foreach (var c in Path.GetInvalidFileNameChars())
{
url = url.Replace(c, '_');
}
Expand Down
2 changes: 1 addition & 1 deletion src/libs/Firecrawl.Cli/Firecrawl.Cli.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<GenerateDocumentationFile>false</GenerateDocumentationFile>
<NoWarn>$(NoWarn);CA1724;CA1303</NoWarn>
<NoWarn>$(NoWarn);CA1724;CA1303;CA1054;CA1055</NoWarn>
</PropertyGroup>

<PropertyGroup Label="NuGet">
Expand Down
30 changes: 30 additions & 0 deletions src/libs/Firecrawl/CrawlClient.WaitJob.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
namespace Firecrawl;

public partial class CrawlClient
{
    /// <summary>
    /// Polls the status of a crawl job until it reports <c>completed</c> or <c>failed</c>.
    /// </summary>
    /// <remarks>
    /// The method waits one second before every status request, including the first one,
    /// so it never returns in less than one second. It has no built-in timeout: any status
    /// other than <c>completed</c> or <c>failed</c> keeps the loop running, so pass a
    /// <paramref name="cancellationToken"/> to bound the wait. A <c>failed</c> job is
    /// returned to the caller, not thrown; inspect the returned status to tell the two apart.
    /// </remarks>
    /// <param name="jobId">Identifier of the crawl job to wait for. Must not be null, empty or whitespace.</param>
    /// <param name="cancellationToken">The token to cancel the operation with</param>
    /// <returns>The status response on which the job first reported <c>completed</c> or <c>failed</c>.</returns>
    /// <exception cref="global::System.ArgumentNullException"><paramref name="jobId"/> is null.</exception>
    /// <exception cref="global::System.ArgumentException"><paramref name="jobId"/> is empty or whitespace.</exception>
    /// <exception cref="global::System.OperationCanceledException">The <paramref name="cancellationToken"/> was canceled while waiting.</exception>
    /// <exception cref="global::System.InvalidOperationException">Presumably surfaced by the underlying status request; its implementation is not visible from this file.</exception>
    public async Task<GetCrawlStatusResponse> WaitJobAsync(
        string jobId,
        CancellationToken cancellationToken = default)
    {
        // Fail fast on an unusable id instead of sleeping and then sending a malformed request.
        if (jobId is null)
        {
            throw new ArgumentNullException(nameof(jobId));
        }

        if (string.IsNullOrWhiteSpace(jobId))
        {
            throw new ArgumentException("Job id must not be empty or whitespace.", nameof(jobId));
        }

        var pollingInterval = TimeSpan.FromSeconds(1);

        while (true)
        {
            cancellationToken.ThrowIfCancellationRequested();

            // Delay first: a job that was just submitted is polled only after one interval.
            await Task.Delay(pollingInterval, cancellationToken).ConfigureAwait(false);

            var statusResponse = await GetCrawlStatusAsync(
                jobId: jobId,
                cancellationToken: cancellationToken).ConfigureAwait(false);

            // Both outcomes are terminal; the caller decides how to treat a failed job.
            if (statusResponse.Status is "completed" or "failed")
            {
                return statusResponse;
            }
        }
    }
}
26 changes: 8 additions & 18 deletions src/tests/IntegrationTests/Tests.Crawl.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,12 @@ public async Task Crawl()

response.JobId.Should().NotBeNullOrEmpty();

GetCrawlStatusResponse? statusResponse = null;
while (!cancellationToken.IsCancellationRequested)
{
await Task.Delay(TimeSpan.FromSeconds(5), cancellationToken);

statusResponse = await api.Crawl.GetCrawlStatusAsync(
jobId: response.JobId!,
cancellationToken: cancellationToken);
if (statusResponse.Status == "completed")
{
break;
}
}
var jobResponse = await api.Crawl.WaitJobAsync(
jobId: response.JobId!,
cancellationToken: cancellationToken);

var index = 0;
foreach (var data in statusResponse?.Data ?? [])
foreach (var data in jobResponse.Data ?? [])
{
data.Html.Should().NotBeNullOrEmpty();
data.Markdown.Should().NotBeNullOrEmpty();
Expand All @@ -51,9 +41,9 @@ public async Task Crawl()
Console.WriteLine($"Output file: {new Uri(fileInfo.FullName).AbsoluteUri}");
}

statusResponse.Should().NotBeNull();
statusResponse!.Status.Should().Be("completed");
statusResponse.Total.Should().Be(3);
statusResponse.Data.Should().NotBeNullOrEmpty();
jobResponse.Should().NotBeNull();
jobResponse.Status.Should().Be("completed");
jobResponse.Total.Should().Be(3);
jobResponse.Data.Should().NotBeNullOrEmpty();
}
}

0 comments on commit 2f6db18

Please sign in to comment.