diff --git a/Parsers/BookRu.Parser/Logic/Parser.cs b/Parsers/BookRu.Parser/Logic/Parser.cs
index ca5d480..6120bbc 100644
--- a/Parsers/BookRu.Parser/Logic/Parser.cs
+++ b/Parsers/BookRu.Parser/Logic/Parser.cs
@@ -2,19 +2,18 @@
 using System.Collections.Generic;
 using System.Linq;
 using System.Net.Http;
-using System.Text;
+using System.Text.Json;
 using System.Threading.Tasks;
 using System.Threading.Tasks.Dataflow;
-using BookRu.Parser.Types.API;
 using BookRu.Parser.Types.API.Book;
 using BookRu.Parser.Types.API.Categories;
 using BookRu.Parser.Types.API.Sidebar;
 using Core.Extensions;
 using Core.Providers.Interfaces;
 using Core.Types;
+using HtmlAgilityPack;
 using Newtonsoft.Json;
 using Parser.Core.Configs;
-using Parser.Core.Extensions;
 using Parser.Core.Logic;
 
 namespace BookRu.Parser.Logic {
@@ -24,11 +23,15 @@ public Parser(IParserConfigBase config, IRepository provider) : base(c
         protected override string ElsName => "BookRu";
 
         protected override async Task<IDataflowBlock[]> RunInternal(HttpClient client, ISet<string> processed) {
-            var getBookIdsBlock = new TransformBlock>(async categoryId => await GetBookIds(client, categoryId));
+            // Only buildId (a plain string) outlives this method, so the page data can be disposed on exit.
+            using var nextData = await GetNextData(client);
+            var buildId = nextData.RootElement.GetProperty("buildId").GetString();
+
+            var getBookIdsBlock = new TransformBlock<MenuItem, IEnumerable<string>>(async categoryId => await GetBookIds(client, buildId, categoryId));
             getBookIdsBlock.CompleteMessage(_logger, "Получение каталогов книг закончено. Ждем получения книг.");
 
             var filterBlock = new TransformManyBlock<IEnumerable<string>, string>(bookIds => Filter(bookIds, processed));
-            var getBooksBlock = new TransformBlock(async bookId => await GetBook(client, bookId), GetParserOptions());
+            var getBooksBlock = new TransformBlock<string, BookInfo>(async bookId => await GetBook(client, buildId, bookId), GetParserOptions());
             getBooksBlock.CompleteMessage(_logger, "Получение книг закончено. Ждем сохранения.");
 
             var batchBlock = new BatchBlock<BookInfo>(_config.BatchSize);
@@ -40,49 +43,91 @@ protected override async Task RunInternal(HttpClient client, I
             getBooksBlock.LinkTo(batchBlock);
             batchBlock.LinkTo(saveBookBlock);
 
-            foreach (var categoryId in await GetCategoryIds(client)) {
+            foreach (var categoryId in GetCategoryIds(nextData)) {
                 await getBookIdsBlock.SendAsync(categoryId);
             }
 
             return new IDataflowBlock[] {getBookIdsBlock, filterBlock, getBooksBlock, batchBlock, saveBookBlock};
         }
+
+        /// <summary>
+        /// Downloads the catalogue page and parses the JSON embedded in its
+        /// <c>__NEXT_DATA__</c> element. The caller owns the returned document and must dispose it.
+        /// </summary>
+        /// <exception cref="InvalidOperationException">The page has no <c>__NEXT_DATA__</c> element.</exception>
+        private async Task<JsonDocument> GetNextData(HttpClient client) {
+            var response = await client.GetStringAsync(new Uri("https://www.book.ru/book"));
+            var doc = new HtmlDocument();
+            doc.LoadHtml(response);
+
+            // GetElementbyId returns null when the element is missing; fail with a clear message instead of an NRE.
+            var nextDataNode = doc.GetElementbyId("__NEXT_DATA__");
+            if (nextDataNode == null) {
+                throw new InvalidOperationException("На странице https://www.book.ru/book не найден элемент __NEXT_DATA__.");
+            }
+
+            return JsonDocument.Parse(nextDataNode.InnerText);
+        }
 
         private static IEnumerable<string> Filter(IEnumerable<string> bookIds, ISet<string> processed) {
             return bookIds.Where(processed.Add);
         }
 
-        private static async Task> GetCategoryIds(HttpClient client) {
-            var sidebar = await client.GetJson>(new Uri("https://www.book.ru/cat/get_sidebar"));
-            return sidebar?.Data.Content.SelectMany(t => t.Value);
+        /// <summary>
+        /// Reads the category menu entries from <c>props.pageProps.serverMenu</c> of the page data.
+        /// </summary>
+        private static IEnumerable<MenuItem> GetCategoryIds(JsonDocument nextData) {
+            var sidebar = JsonConvert.DeserializeObject<Sidebar[]>(nextData
+                .RootElement.GetProperty("props")
+                .GetProperty("pageProps")
+                .GetProperty("serverMenu")
+                .GetRawText());
+
+            // A JSON null, or a section without a menu, deserializes to null; treat both as empty.
+            return (sidebar ?? Array.Empty<Sidebar>()).SelectMany(s => s.Menu ?? Array.Empty<MenuItem>());
         }
 
-        private static async Task> GetBookIds(HttpClient client, MenuItem menuItem) {
-            var data = new {
-                cat_id = menuItem.Id,
-                as_view = 3,
-                years = Array.Empty()
-            };
-
-            _logger.Info($"Получаем каталог с ID = {menuItem.Id}, Name = {menuItem.Text}");
+        /// <summary>
+        /// Fetches the ids of the books listed in a category; the menu entry with id "new" is skipped.
+        /// </summary>
+        private static async Task<IEnumerable<string>> GetBookIds(HttpClient client, string buildId, MenuItem menuItem) {
+            if (menuItem.Id == "new") {
+                return Enumerable.Empty<string>();
+            }
+
+            _logger.Info($"Получаем каталог с ID = {menuItem.Id}, Name = {menuItem.Name}");
+
+            var response = await client.GetStringAsync(new Uri($"https://book.ru/_next/data/{buildId}/cat/{menuItem.Id}.json"));
+            // JsonDocument is IDisposable; release it as soon as the raw payload has been handed to Newtonsoft.
+            using var json = JsonDocument.Parse(response);
 
-            var content = new StringContent(JsonConvert.SerializeObject(data), Encoding.UTF8, "application/json");
-            var response = await client.PostJson>(new Uri("https://www.book.ru/cat/get_categories"), content);
-            return response?.Data?.Content?.Select(t => t.Key) ?? Enumerable.Empty();
+            var content = JsonConvert.DeserializeObject<CategoryContent>(json.RootElement.GetProperty("pageProps").GetProperty("allServerData").GetRawText());
+            return content?.Item?.Select(i => i.Id.ToString()) ?? Enumerable.Empty<string>();
         }
 
-        private async Task GetBook(HttpClient client, string id) {
-            var response = await client.GetJson>>(new Uri($"https://www.book.ru/book/get_book/{id}"));
-            return response?.Data == default || !response.Data.TryGetValue(id, out var book)
-                ? default
-                : new BookInfo(id, ElsName) {
-                    Authors = book.Author,
-                    Bib = book.Bib,
-                    ISBN = book.ISBN,
-                    Name = book.Name,
-                    Pages = book.Pages ?? 0,
-                    Year = book.Year,
-                    Publisher = book.Publisher
-                };
+        /// <summary>
+        /// Fetches one book by id and maps its first item to a <see cref="BookInfo"/>; returns default when the response has no items.
+        /// </summary>
+        private async Task<BookInfo> GetBook(HttpClient client, string buildId, string id) {
+            var response = await client.GetStringAsync(new Uri($"https://book.ru/_next/data/{buildId}/book/{id}.json?"));
+            using var json = JsonDocument.Parse(response);
+
+            var items = JsonConvert.DeserializeObject<BookItem[]>(json.RootElement.GetProperty("pageProps").GetProperty("serverDataBook").GetProperty("item").GetRawText());
+
+            if (items == null || items.Length == 0) {
+                return default;
+            }
+
+            var book = items[0];
+            return new BookInfo(id, ElsName) {
+                Authors = book.Author,
+                Bib = book.BiblioDesc,
+                ISBN = book.ISBN,
+                Name = book.Name,
+                Pages = book.Pages ?? 0,
+                Year = book.Year,
+                Publisher = book.Publisher
+            };
         }
     }
 }
diff --git a/Parsers/BookRu.Parser/Types/API/Book/BookItem.cs b/Parsers/BookRu.Parser/Types/API/Book/BookItem.cs
index bb19599..10a8f4d 100644
--- a/Parsers/BookRu.Parser/Types/API/Book/BookItem.cs
+++ b/Parsers/BookRu.Parser/Types/API/Book/BookItem.cs
@@ -2,11 +2,10 @@
 
 namespace BookRu.Parser.Types.API.Book {
     public class BookItem {
-        [JsonProperty("biblio_desc_2")]
-        public string Bib;
+        [JsonProperty("biblio_desc")]
+        public string BiblioDesc;
         public string Author;
         public int? Pages;
-        [JsonProperty("year_norm")]
         public string Year;
         [JsonProperty("pub_name")]
         public string Publisher;
diff --git a/Parsers/BookRu.Parser/Types/API/Categories/CategoryContent.cs b/Parsers/BookRu.Parser/Types/API/Categories/CategoryContent.cs
index c282a0a..24d8a59 100644
--- a/Parsers/BookRu.Parser/Types/API/Categories/CategoryContent.cs
+++ b/Parsers/BookRu.Parser/Types/API/Categories/CategoryContent.cs
@@ -1,7 +1,5 @@
-using System.Collections.Generic;
-
 namespace BookRu.Parser.Types.API.Categories {
     public class CategoryContent {
-        public Dictionary Content = new();
+        public CategoryItem[] Item { get; set; }
     }
 }
diff --git a/Parsers/BookRu.Parser/Types/API/Categories/CategoryItem.cs b/Parsers/BookRu.Parser/Types/API/Categories/CategoryItem.cs
new file mode 100644
index 0000000..d0d14b4
--- /dev/null
+++ b/Parsers/BookRu.Parser/Types/API/Categories/CategoryItem.cs
@@ -0,0 +1,5 @@
+namespace BookRu.Parser.Types.API.Categories;
+
+public class CategoryItem {
+    public long Id { get; set; }
+}
\ No newline at end of file
diff --git a/Parsers/BookRu.Parser/Types/API/Sidebar/MenuItem.cs b/Parsers/BookRu.Parser/Types/API/Sidebar/MenuItem.cs
index b6657cc..3f66d37 100644
--- a/Parsers/BookRu.Parser/Types/API/Sidebar/MenuItem.cs
+++ b/Parsers/BookRu.Parser/Types/API/Sidebar/MenuItem.cs
@@ -1,6 +1,6 @@
 namespace BookRu.Parser.Types.API.Sidebar {
     public class MenuItem {
-        public long Id;
-        public string Text;
+        public string Id { get; set; }
+        public string Name { get; set; }
     }
 }
diff --git a/Parsers/BookRu.Parser/Types/API/Sidebar/Sidebar.cs b/Parsers/BookRu.Parser/Types/API/Sidebar/Sidebar.cs
index 1627b8f..fa6df9c 100644
--- a/Parsers/BookRu.Parser/Types/API/Sidebar/Sidebar.cs
+++ b/Parsers/BookRu.Parser/Types/API/Sidebar/Sidebar.cs
@@ -3,7 +3,6 @@
 
 namespace BookRu.Parser.Types.API.Sidebar {
     public class Sidebar {
-        [JsonProperty("type_id_content")]
-        public Dictionary Content = new();
+        public MenuItem[] Menu { get; set; }
     }
 }