Skip to content

Commit

Permalink
fix bookru
Browse files Browse the repository at this point in the history
  • Loading branch information
Oleg Koloskov committed Nov 3, 2023
1 parent 4c7afde commit b9f91ae
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 43 deletions.
88 changes: 55 additions & 33 deletions Parsers/BookRu.Parser/Logic/Parser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,18 @@
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;
using BookRu.Parser.Types.API;
using BookRu.Parser.Types.API.Book;
using BookRu.Parser.Types.API.Categories;
using BookRu.Parser.Types.API.Sidebar;
using Core.Extensions;
using Core.Providers.Interfaces;
using Core.Types;
using HtmlAgilityPack;
using Newtonsoft.Json;
using Parser.Core.Configs;
using Parser.Core.Extensions;
using Parser.Core.Logic;

namespace BookRu.Parser.Logic {
Expand All @@ -24,11 +23,14 @@ public Parser(IParserConfigBase config, IRepository<BookInfo> provider) : base(c
protected override string ElsName => "BookRu";

// Builds the dataflow pipeline for one parse run, feeds it with the catalogue
// categories and returns all pipeline blocks in a single array.
// NOTE(review): this text is a rendered commit diff without +/- markers. Some statements
// therefore appear twice (an older and a newer form next to each other) and part of the
// body is hidden behind the "Expand All" hunk marker - confirm the final shape against
// the repository before relying on it.
protected override async Task<IDataflowBlock[]> RunInternal(HttpClient client, ISet<string> processed) {
var getBookIdsBlock = new TransformBlock<MenuItem, IEnumerable<string>>(async categoryId => await GetBookIds(client, categoryId));
// Fetch the page's embedded JSON once; its "buildId" value is passed on to
// GetBookIds and GetBook, which interpolate it into their request URLs.
var nextData = await GetNextData(client);
var buildId = nextData.RootElement.GetProperty("buildId").GetString();

// Stage 1: menu item -> ids of the books in that category.
var getBookIdsBlock = new TransformBlock<MenuItem, IEnumerable<string>>(async categoryId => await GetBookIds(client, buildId, categoryId));
getBookIdsBlock.CompleteMessage(_logger, "Получение каталогов книг закончено. Ждем получения книг.");

// Stage 2: flatten the id lists and drop ids that Filter has already recorded in `processed`.
var filterBlock = new TransformManyBlock<IEnumerable<string>, string>(bookIds => Filter(bookIds, processed));
var getBooksBlock = new TransformBlock<string, BookInfo>(async bookId => await GetBook(client, bookId), GetParserOptions());
// Stage 3: book id -> BookInfo, configured with the options from GetParserOptions().
var getBooksBlock = new TransformBlock<string, BookInfo>(async bookId => await GetBook(client, buildId, bookId), GetParserOptions());
getBooksBlock.CompleteMessage(_logger, "Получение книг закончено. Ждем сохранения.");

// Stage 4: group books into batches of _config.BatchSize before they reach saveBookBlock.
var batchBlock = new BatchBlock<BookInfo>(_config.BatchSize);
Expand All @@ -40,49 +42,69 @@ protected override async Task<IDataflowBlock[]> RunInternal(HttpClient client, I
getBooksBlock.LinkTo(batchBlock);
batchBlock.LinkTo(saveBookBlock);

foreach (var categoryId in await GetCategoryIds(client)) {
// Categories are taken from the already downloaded nextData; each one is posted to stage 1.
foreach (var categoryId in GetCategoryIds(nextData)) {
await getBookIdsBlock.SendAsync(categoryId);
}

return new IDataflowBlock[] {getBookIdsBlock, filterBlock, getBooksBlock, batchBlock, saveBookBlock};
}

/// <summary>
/// Downloads the book.ru catalogue page and parses the JSON embedded in its
/// <c>__NEXT_DATA__</c> element.
/// </summary>
/// <param name="client">HTTP client used to request the page.</param>
/// <returns>
/// The parsed JSON document. <see cref="JsonDocument"/> is disposable; ownership passes to the caller.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The downloaded page does not contain an element with id <c>__NEXT_DATA__</c>.
/// </exception>
private async Task<JsonDocument> GetNextData(HttpClient client) {
    var html = await client.GetStringAsync(new Uri("https://www.book.ru/book"));
    var doc = new HtmlDocument();
    doc.LoadHtml(html);

    // GetElementbyId yields null when no element carries the id (changed markup, error page, ...).
    // Fail with a descriptive error instead of a NullReferenceException on .InnerText.
    var nextDataNode = doc.GetElementbyId("__NEXT_DATA__");
    if (nextDataNode == null) {
        throw new InvalidOperationException("Element '__NEXT_DATA__' was not found on https://www.book.ru/book.");
    }

    return JsonDocument.Parse(nextDataNode.InnerText);
}

/// <summary>
/// Lazily yields only those ids that were not yet contained in <paramref name="processed"/>.
/// Every yielded id is added to <paramref name="processed"/> as a side effect of the check.
/// </summary>
/// <param name="bookIds">Candidate book ids.</param>
/// <param name="processed">Set of ids seen so far; it is mutated while the result is enumerated.</param>
/// <returns>The ids for which <c>processed.Add</c> returned <c>true</c>.</returns>
private static IEnumerable<string> Filter(IEnumerable<string> bookIds, ISet<string> processed) {
    // ISet<T>.Add returns false for an element that is already present, so it doubles as the predicate.
    Func<string, bool> isFirstOccurrence = processed.Add;
    return bookIds.Where(isFirstOccurrence);
}

private static async Task<IEnumerable<MenuItem>> GetCategoryIds(HttpClient client) {
var sidebar = await client.GetJson<ApiResponse<Sidebar>>(new Uri("https://www.book.ru/cat/get_sidebar"));
return sidebar?.Data.Content.SelectMany(t => t.Value);
/// <summary>
/// Reads <c>props.pageProps.serverMenu</c> from the given JSON document and returns the
/// menu items of all its sections as one flat sequence.
/// </summary>
/// <param name="nextData">JSON document that contains the <c>props.pageProps.serverMenu</c> value.</param>
/// <returns>
/// The menu items of every section; empty when <c>serverMenu</c> is JSON <c>null</c>.
/// Sections that are null or have no <c>Menu</c> are skipped.
/// </returns>
/// <exception cref="KeyNotFoundException">One of the expected properties is missing.</exception>
private static IEnumerable<MenuItem> GetCategoryIds(JsonDocument nextData) {
    var serverMenu = nextData
        .RootElement.GetProperty("props")
        .GetProperty("pageProps")
        .GetProperty("serverMenu");

    // DeserializeObject returns null for a JSON null, and individual sections
    // (or their Menu arrays) may be null as well; none of these should crash the run.
    var sidebar = JsonConvert.DeserializeObject<Sidebar[]>(serverMenu.GetRawText());
    if (sidebar == null) {
        return Enumerable.Empty<MenuItem>();
    }

    return sidebar
        .Where(s => s?.Menu != null)
        .SelectMany(s => s.Menu);
}

// Returns the ids of the books listed in the catalogue category described by menuItem.
// NOTE(review): this text is a rendered commit diff without +/- markers, so two signatures
// and two sets of body statements (an older POST-based form and a newer _next/data form)
// are interleaved below - confirm the final shape against the repository.
private static async Task<IEnumerable<string>> GetBookIds(HttpClient client, MenuItem menuItem) {
var data = new {
cat_id = menuItem.Id,
as_view = 3,
years = Array.Empty<string>()
};

_logger.Info($"Получаем каталог с ID = {menuItem.Id}, Name = {menuItem.Text}");
private static async Task<IEnumerable<string>> GetBookIds(HttpClient client, string buildId, MenuItem menuItem) {
// The menu entry whose Id is "new" is not requested; it yields no ids.
if (menuItem.Id == "new") {
return Enumerable.Empty<string>();
}

_logger.Info($"Получаем каталог с ID = {menuItem.Id}, Name = {menuItem.Name}");

// The category data is requested from a URL that embeds buildId and the category id.
var response = await client.GetStringAsync(new Uri($"https://book.ru/_next/data/{buildId}/cat/{menuItem.Id}.json"));
var json = JsonDocument.Parse(response);

var content = new StringContent(JsonConvert.SerializeObject(data), Encoding.UTF8, "application/json");
var response = await client.PostJson<ApiResponse<CategoryContent>>(new Uri("https://www.book.ru/cat/get_categories"), content);
return response?.Data?.Content?.Select(t => t.Key) ?? Enumerable.Empty<string>();
// "pageProps.allServerData" is deserialized into CategoryContent; each Item's Id is returned as a string.
return JsonConvert.DeserializeObject<CategoryContent>(json.RootElement.GetProperty("pageProps").GetProperty("allServerData").GetRawText()).Item.Select(i => i.Id.ToString());
}

private async Task<BookInfo> GetBook(HttpClient client, string id) {
var response = await client.GetJson<ApiResponse<Dictionary<string, BookItem>>>(new Uri($"https://www.book.ru/book/get_book/{id}"));
return response?.Data == default || !response.Data.TryGetValue(id, out var book)
? default
: new BookInfo(id, ElsName) {
Authors = book.Author,
Bib = book.Bib,
ISBN = book.ISBN,
Name = book.Name,
Pages = book.Pages ?? 0,
Year = book.Year,
Publisher = book.Publisher
};
/// <summary>
/// Downloads the JSON description of a single book and maps it to a <see cref="BookInfo"/>.
/// </summary>
/// <param name="client">HTTP client used for the request.</param>
/// <param name="buildId">Build identifier that is part of the <c>_next/data</c> URL.</param>
/// <param name="id">Id of the book to load.</param>
/// <returns>
/// The mapped book, or <c>default</c> when <c>pageProps.serverDataBook.item</c> is null,
/// empty, or its first entry is null.
/// </returns>
/// <exception cref="KeyNotFoundException">One of the expected JSON properties is missing.</exception>
private async Task<BookInfo> GetBook(HttpClient client, string buildId, string id) {
    var response = await client.GetStringAsync(new Uri($"https://book.ru/_next/data/{buildId}/book/{id}.json?"));

    // JsonDocument is IDisposable; only the raw text of one property is needed,
    // so the document is released as soon as this method returns.
    using var json = JsonDocument.Parse(response);
    var rawItems = json.RootElement
        .GetProperty("pageProps")
        .GetProperty("serverDataBook")
        .GetProperty("item")
        .GetRawText();

    // A JSON null deserializes to a null array and "[null]" to an array with a null entry;
    // both are treated like "no book" instead of failing with a NullReferenceException.
    var items = JsonConvert.DeserializeObject<BookItem[]>(rawItems);
    if (items == null || items.Length == 0 || items[0] == null) {
        return default;
    }

    var book = items[0];
    return new BookInfo(id, ElsName) {
        Authors = book.Author,
        Bib = book.BiblioDesc,
        ISBN = book.ISBN,
        Name = book.Name,
        Pages = book.Pages ?? 0,
        Year = book.Year,
        Publisher = book.Publisher
    };
}
}
}
5 changes: 2 additions & 3 deletions Parsers/BookRu.Parser/Types/API/Book/BookItem.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@

namespace BookRu.Parser.Types.API.Book {
public class BookItem {
[JsonProperty("biblio_desc_2")]
public string Bib;
[JsonProperty("biblio_desc")]
public string BiblioDesc;
public string Author;
public int? Pages;
[JsonProperty("year_norm")]
public string Year;
[JsonProperty("pub_name")]
public string Publisher;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
using System.Collections.Generic;

namespace BookRu.Parser.Types.API.Categories {
/// <summary>
/// JSON deserialization target for the data of one catalogue category
/// (Parser.GetBookIds deserializes "pageProps.allServerData" into it).
/// </summary>
public class CategoryContent {
// NOTE(review): this text is a rendered diff without +/- markers; the Content field looks like
// the pre-commit member that the Item property below replaces - confirm against the repository.
public Dictionary<string, object> Content = new();
/// <summary>Entries of the category; Parser.GetBookIds projects each entry's Id.</summary>
public CategoryItem[] Item { get; set; }
}
}
5 changes: 5 additions & 0 deletions Parsers/BookRu.Parser/Types/API/Categories/CategoryItem.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
namespace BookRu.Parser.Types.API.Categories;

/// <summary>
/// One entry of <c>CategoryContent.Item</c>. The member is filled by JSON deserialization,
/// so its name and type must stay in step with the payload.
/// </summary>
public class CategoryItem {
/// <summary>Numeric id of the entry; Parser.GetBookIds returns it converted to a string.</summary>
public long Id { get; set; }
}
4 changes: 2 additions & 2 deletions Parsers/BookRu.Parser/Types/API/Sidebar/MenuItem.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
namespace BookRu.Parser.Types.API.Sidebar {
/// <summary>
/// One catalogue menu entry, filled by JSON deserialization as an element of <c>Sidebar.Menu</c>.
/// </summary>
public class MenuItem {
// NOTE(review): this text is a rendered diff without +/- markers; the two fields below look like
// the pre-commit members that the properties after them replace - confirm against the repository.
public long Id;
public string Text;
/// <summary>
/// Identifier of the entry. Parser.GetBookIds compares it with "new" and inserts it into the category URL.
/// </summary>
public string Id { get; set; }
/// <summary>Name of the entry; only used in the log message written by Parser.GetBookIds.</summary>
public string Name { get; set; }
}
}
3 changes: 1 addition & 2 deletions Parsers/BookRu.Parser/Types/API/Sidebar/Sidebar.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

namespace BookRu.Parser.Types.API.Sidebar {
/// <summary>
/// One section of the site menu. Parser.GetCategoryIds deserializes the "serverMenu"
/// JSON value into an array of this type.
/// </summary>
public class Sidebar {
// NOTE(review): this text is a rendered diff without +/- markers; the attribute and the Content
// field below look like the pre-commit members that the Menu property replaces - confirm
// against the repository.
[JsonProperty("type_id_content")]
public Dictionary<long, MenuItem[]> Content = new();
/// <summary>Menu entries of this section; Parser.GetCategoryIds flattens them across all sections.</summary>
public MenuItem[] Menu { get; set; }
}
}

0 comments on commit b9f91ae

Please sign in to comment.