Skip to content

Commit 4cf06c6

Browse files
committed
Content reader options to handle missing files
1 parent d392778 commit 4cf06c6

File tree

12 files changed

+228
-35
lines changed

12 files changed

+228
-35
lines changed

Documentation/malformed-epub/index.md

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,63 @@ EpubReaderOptions options = new()
3434
};
3535
```
3636

37+
## Missing content files
38+
39+
The [`item` element](https://www.w3.org/publishing/epub32/epub-packages.html#sec-item-elem) within the EPUB manifest has a required `href` attribute which points to a content file in the EPUB archive. There are [some EPUB books](https://github.com/vers-one/EpubReader/issues/25) that declare content files in the EPUB manifest which do not exist in the actual EPUB archive. This causes EpubReader to throw the *"EPUB parsing error: file ... was not found in the EPUB file"* exception. This exception is thrown immediately if the application uses the [`EpubReader.ReadBook`](xref:VersOne.Epub.EpubReader#VersOne_Epub_EpubReader_ReadBook_System_IO_Stream_VersOne_Epub_Options_EpubReaderOptions_) / [`EpubReader.ReadBookAsync`](xref:VersOne.Epub.EpubReader#VersOne_Epub_EpubReader_ReadBookAsync_System_IO_Stream_VersOne_Epub_Options_EpubReaderOptions_) methods, because they try to load the whole content of the book into memory. The [`EpubReader.OpenBook`](xref:VersOne.Epub.EpubReader#VersOne_Epub_EpubReader_OpenBook_System_IO_Stream_VersOne_Epub_Options_EpubReaderOptions_) and [`EpubReader.OpenBookAsync`](xref:VersOne.Epub.EpubReader#VersOne_Epub_EpubReader_OpenBookAsync_System_IO_Stream_VersOne_Epub_Options_EpubReaderOptions_) methods don't load the content, so the exception will be thrown only when one of the following methods is called for a missing file:
40+
* [`EpubContentFileRef`](xref:VersOne.Epub.EpubContentFileRef) class:
41+
* [`GetContentStream`](xref:VersOne.Epub.EpubContentFileRef#VersOne_Epub_EpubContentFileRef_GetContentStream)
42+
* [`ReadContentAsBytes`](xref:VersOne.Epub.EpubContentFileRef#VersOne_Epub_EpubContentFileRef_ReadContentAsBytes)
43+
* [`ReadContentAsBytesAsync`](xref:VersOne.Epub.EpubContentFileRef#VersOne_Epub_EpubContentFileRef_ReadContentAsBytesAsync)
44+
* [`ReadContentAsText`](xref:VersOne.Epub.EpubContentFileRef#VersOne_Epub_EpubContentFileRef_ReadContentAsText)
45+
* [`ReadContentAsTextAsync`](xref:VersOne.Epub.EpubContentFileRef#VersOne_Epub_EpubContentFileRef_ReadContentAsTextAsync)
46+
* [`EpubByteContentFileRef`](xref:VersOne.Epub.EpubByteContentFileRef) class:
47+
* [`ReadContent`](xref:VersOne.Epub.EpubByteContentFileRef#VersOne_Epub_EpubByteContentFileRef_ReadContent)
48+
* [`ReadContentAsync`](xref:VersOne.Epub.EpubByteContentFileRef#VersOne_Epub_EpubByteContentFileRef_ReadContentAsync)
49+
* [`EpubTextContentFileRef`](xref:VersOne.Epub.EpubTextContentFileRef) class:
50+
* [`ReadContent`](xref:VersOne.Epub.EpubTextContentFileRef#VersOne_Epub_EpubTextContentFileRef_ReadContent)
51+
* [`ReadContentAsync`](xref:VersOne.Epub.EpubTextContentFileRef#VersOne_Epub_EpubTextContentFileRef_ReadContentAsync)
52+
53+
The [`ContentReaderOptions.ContentFileMissing`](xref:VersOne.Epub.Options.ContentReaderOptions#VersOne_Epub_Options_ContentReaderOptions_ContentFileMissing) event can be used to detect those issues and to instruct EpubReader how to handle missing content files. The application can choose one of the following options:
54+
55+
### 1. Get notified about missing content files
56+
57+
```csharp
58+
EpubReaderOptions options = new();
59+
options.ContentReaderOptions.ContentFileMissing += (sender, e) =>
60+
{
61+
Console.WriteLine($"Content file is missing: content file name = '{e.FileName}', content file path in the EPUB archive = '{e.FilePathInEpubArchive}', content type = {e.ContentType}, MIME type = {e.ContentMimeType}.");
62+
};
63+
```
64+
65+
This lets the application be notified about the missing content file but does not prevent EpubReader from throwing the exception.
66+
67+
### 2. Suppress exceptions
68+
69+
```csharp
70+
EpubReaderOptions options = new();
71+
options.ContentReaderOptions.ContentFileMissing += (sender, e) =>
72+
{
73+
e.SuppressException = true;
74+
};
75+
```
76+
77+
This will prevent all missing content file exceptions from being thrown. EpubReader will treat missing content files as existing but empty files.
78+
79+
### 3. Provide a replacement content
80+
81+
```csharp
82+
EpubReaderOptions options = new();
83+
options.ContentReaderOptions.ContentFileMissing += (sender, e) =>
84+
{
85+
if (e.FileName == "chapter1.html")
86+
{
87+
e.ReplacementContentStream = new FileStream(@"C:\Temp\chapter1-replacement.html", FileMode.Open);
88+
}
89+
};
90+
```
91+
92+
This lets the application substitute the content of a missing file with other content. The value of the [`ReplacementContentStream`](xref:VersOne.Epub.Options.ContentFileMissingEventArgs#VersOne_Epub_Options_ContentFileMissingEventArgs_ReplacementContentStream) property can be any [`Stream`](xref:System.IO.Stream). The content of the stream is read only once, after which it will be cached in the EPUB content reader. The stream will be closed after its content is fully read.
93+
3794
## Missing content attribute for EPUB 2 NCX navigation points
3895

3996
The `navPoint` element within the [EPUB 2 NCX navigation document](https://daisy.org/activities/standards/daisy/daisy-3/z39-86-2005-r2012-specifications-for-the-digital-talking-book/#NCX) must contain a nested `content` element pointing to a content file associated with this navigation item. There are some EPUB 2 books that have navigation points without a nested `content` element which causes EpubReader to throw the *"EPUB parsing error: navigation point X should contain content"* exception.

Documentation/templates/default/partials/class.tmpl.partial

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -101,21 +101,7 @@
101101
<h5 class="propertyValue">Property type: {{{type.specName.0.value}}}</h5>
102102
{{/propertyValue}}
103103
{{#eventType}}
104-
<h5 class="eventType">{{__global.eventType}}</h5>
105-
<table class="table table-bordered table-striped table-condensed">
106-
<thead>
107-
<tr>
108-
<th>{{__global.type}}</th>
109-
<th>{{__global.description}}</th>
110-
</tr>
111-
</thead>
112-
<tbody>
113-
<tr>
114-
<td>{{{type.specName.0.value}}}</td>
115-
<td>{{{description}}}</td>
116-
</tr>
117-
</tbody>
118-
</table>
104+
<h5 class="eventType">Event type: {{{type.specName.0.value}}}</h5>
119105
{{/eventType}}
120106
{{/syntax}}
121107
{{#overridden}}

Source/VersOne.Epub.WpfDemo/Models/BookModel.cs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
using System.Collections.Generic;
22
using System.Linq;
33
using System.Threading.Tasks;
4+
using VersOne.Epub.Options;
45
using VersOne.Epub.WpfDemo.Entities;
56
using VersOne.Epub.WpfDemo.ViewModels;
67

@@ -17,7 +18,12 @@ public BookModel()
1718

1819
public async Task<EpubBook> OpenBookAsync(int bookId)
1920
{
20-
EpubBook epubBook = await EpubReader.ReadBookAsync(settings.Books.First(book => book.Id == bookId).FilePath);
21+
EpubReaderOptions epubReaderOptions = new EpubReaderOptions();
22+
epubReaderOptions.ContentReaderOptions.ContentFileMissing += (sender, e) =>
23+
{
24+
e.SuppressException = true;
25+
};
26+
EpubBook epubBook = await EpubReader.ReadBookAsync(settings.Books.First(book => book.Id == bookId).FilePath, epubReaderOptions);
2127
return epubBook;
2228
}
2329

Source/VersOne.Epub/Environment/Implementation/ZipFile.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ public ZipFile(ZipArchive zipArchive)
2020

2121
public IZipFileEntry GetEntry(string entryName)
2222
{
23-
return new ZipFileEntry(zipArchive.GetEntry(entryName));
23+
ZipArchiveEntry zipArchiveEntry = zipArchive.GetEntry(entryName);
24+
return zipArchiveEntry != null ? new ZipFileEntry(zipArchiveEntry) : null;
2425
}
2526

2627
public void Dispose()

Source/VersOne.Epub/EpubReader.cs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,16 +114,20 @@ public static async Task<EpubBook> ReadBookAsync(Stream stream, EpubReaderOption
114114
private static async Task<EpubBookRef> OpenBookAsync(IZipFile zipFile, string filePath, EpubReaderOptions epubReaderOptions)
115115
{
116116
EpubBookRef result = null;
117+
if (epubReaderOptions == null)
118+
{
119+
epubReaderOptions = new EpubReaderOptions();
120+
}
117121
try
118122
{
119123
result = new EpubBookRef(zipFile);
120124
result.FilePath = filePath;
121-
result.Schema = await SchemaReader.ReadSchemaAsync(zipFile, epubReaderOptions ?? new EpubReaderOptions()).ConfigureAwait(false);
125+
result.Schema = await SchemaReader.ReadSchemaAsync(zipFile, epubReaderOptions).ConfigureAwait(false);
122126
result.Title = result.Schema.Package.Metadata.Titles.FirstOrDefault() ?? String.Empty;
123127
result.AuthorList = result.Schema.Package.Metadata.Creators.Select(creator => creator.Creator).ToList();
124128
result.Author = String.Join(", ", result.AuthorList);
125129
result.Description = result.Schema.Package.Metadata.Description;
126-
result.Content = await Task.Run(() => ContentReader.ParseContentMap(result)).ConfigureAwait(false);
130+
result.Content = await Task.Run(() => ContentReader.ParseContentMap(result, epubReaderOptions.ContentReaderOptions)).ConfigureAwait(false);
127131
return result;
128132
}
129133
catch
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
using System;
2+
using System.IO;
3+
4+
namespace VersOne.Epub.Options
5+
{
6+
/// <summary>
7+
/// Provides data for the <see cref="ContentReaderOptions.ContentFileMissing" /> event.
8+
/// </summary>
9+
public class ContentFileMissingEventArgs : EventArgs
10+
{
11+
/// <summary>
12+
/// Initializes a new instance of the <see cref="ContentFileMissingEventArgs" /> class with a specified file name, an absolute file path, a content type of the file,
13+
/// and a MIME type of the file's content.
14+
/// </summary>
15+
/// <param name="fileName">Relative file path of the missing content file (as it is specified in the EPUB manifest).</param>
16+
/// <param name="filePathInEpubArchive">Absolute file path of the missing content file in the EPUB archive.</param>
17+
/// <param name="contentType">The type of the content of the missing file.</param>
18+
/// <param name="contentMimeType">The MIME type of the content of the missing file.</param>
/// <remarks>
/// <see cref="SuppressException" /> is initialized to <c>false</c>. <see cref="ReplacementContentStream" /> is not assigned by this constructor.
/// </remarks>
19+
public ContentFileMissingEventArgs(string fileName, string filePathInEpubArchive, EpubContentType contentType, string contentMimeType)
20+
{
21+
FileName = fileName;
22+
FilePathInEpubArchive = filePathInEpubArchive;
23+
ContentType = contentType;
24+
ContentMimeType = contentMimeType;
25+
// Exceptions are not suppressed by default; an event handler has to set this property to opt in.
SuppressException = false;
26+
}
27+
28+
/// <summary>
29+
/// Gets the relative file path of the missing content file (as it is specified in the EPUB manifest).
30+
/// </summary>
31+
public string FileName { get; }
32+
33+
/// <summary>
34+
/// Gets the absolute file path of the missing content file in the EPUB archive.
35+
/// </summary>
36+
public string FilePathInEpubArchive { get; }
37+
38+
/// <summary>
39+
/// Gets the type of the content of the missing file.
40+
/// </summary>
41+
public EpubContentType ContentType { get; }
42+
43+
/// <summary>
44+
/// Gets the MIME type of the content of the missing file.
45+
/// </summary>
46+
public string ContentMimeType { get; }
47+
48+
/// <summary>
49+
/// Gets or sets a value indicating whether the EPUB content reader should suppress the exception for the missing file. If it is set to <c>true</c>
50+
/// and the replacement content stream is not provided (via the <see cref="ReplacementContentStream" /> property), then the EPUB content reader will treat
51+
/// the missing file as an existing but empty file.
52+
/// Default value is <c>false</c>.
53+
/// </summary>
54+
public bool SuppressException { get; set; }
55+
56+
/// <summary>
57+
/// Gets or sets the replacement content stream. This property allows the application to provide a replacement content for the missing file in the form of
58+
/// a <see cref="Stream" />. When the content stream is provided, the EPUB content reader will not throw an exception for the missing file,
59+
/// regardless of the value of the <see cref="SuppressException" /> property. The content of the stream is read only once, after which it will be cached
60+
/// in the EPUB content reader. The stream will be closed after its content is fully read.
61+
/// </summary>
62+
public Stream ReplacementContentStream { get; set; }
63+
}
64+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
using System;
2+
3+
namespace VersOne.Epub.Options
4+
{
5+
/// <summary>
6+
/// Various options to configure the behavior of the EPUB content reader.
7+
/// </summary>
8+
public class ContentReaderOptions
9+
{
10+
/// <summary>
11+
/// Occurs when a content file is listed in the EPUB manifest but the content reader is unable to find it in the EPUB archive.
12+
/// This event lets the application be notified of such errors and decide how the EPUB content reader should handle the missing file.
13+
/// </summary>
14+
public event EventHandler<ContentFileMissingEventArgs> ContentFileMissing;
15+
16+
/// <summary>
/// Raises the <see cref="ContentFileMissing" /> event, passing this options instance as the sender.
/// Does nothing if no handlers are subscribed to the event.
/// </summary>
/// <param name="contentFileMissingEventArgs">The event data passed to the subscribed handlers.</param>
internal void RaiseContentFileMissingEvent(ContentFileMissingEventArgs contentFileMissingEventArgs)
17+
{
18+
ContentFileMissing?.Invoke(this, contentFileMissingEventArgs);
19+
}
20+
}
21+
}

Source/VersOne.Epub/Options/EpubReaderOptions.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ public class EpubReaderOptions
1111
public EpubReaderOptions()
1212
{
1313
PackageReaderOptions = new PackageReaderOptions();
14+
ContentReaderOptions = new ContentReaderOptions();
1415
Epub2NcxReaderOptions = new Epub2NcxReaderOptions();
1516
XmlReaderOptions = new XmlReaderOptions();
1617
}
@@ -20,6 +21,11 @@ public EpubReaderOptions()
2021
/// </summary>
2122
public PackageReaderOptions PackageReaderOptions { get; set; }
2223

24+
/// <summary>
25+
/// Gets or sets EPUB content reader options.
26+
/// </summary>
27+
public ContentReaderOptions ContentReaderOptions { get; set; }
28+
2329
/// <summary>
2430
/// Gets or sets EPUB 2 NCX navigation document reader options.
2531
/// </summary>

Source/VersOne.Epub/Readers/ContentReader.cs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
using System.Collections.Generic;
2+
using VersOne.Epub.Options;
23
using VersOne.Epub.Schema;
34

45
namespace VersOne.Epub.Internal
56
{
67
internal static class ContentReader
78
{
8-
public static EpubContentRef ParseContentMap(EpubBookRef bookRef)
9+
public static EpubContentRef ParseContentMap(EpubBookRef bookRef, ContentReaderOptions contentReaderOptions)
910
{
1011
EpubContentRef result = new EpubContentRef
1112
{
@@ -29,7 +30,7 @@ public static EpubContentRef ParseContentMap(EpubBookRef bookRef)
2930
case EpubContentType.XML:
3031
case EpubContentType.DTBOOK:
3132
case EpubContentType.DTBOOK_NCX:
32-
EpubTextContentFileRef epubTextContentFile = new EpubTextContentFileRef(bookRef, fileName, contentType, contentMimeType);
33+
EpubTextContentFileRef epubTextContentFile = new EpubTextContentFileRef(bookRef, fileName, contentType, contentMimeType, contentReaderOptions);
3334
switch (contentType)
3435
{
3536
case EpubContentType.XHTML_1_1:
@@ -46,7 +47,7 @@ public static EpubContentRef ParseContentMap(EpubBookRef bookRef)
4647
result.AllFiles[fileName] = epubTextContentFile;
4748
break;
4849
default:
49-
EpubByteContentFileRef epubByteContentFile = new EpubByteContentFileRef(bookRef, fileName, contentType, contentMimeType);
50+
EpubByteContentFileRef epubByteContentFile = new EpubByteContentFileRef(bookRef, fileName, contentType, contentMimeType, contentReaderOptions);
5051
switch (contentType)
5152
{
5253
case EpubContentType.IMAGE_GIF:

Source/VersOne.Epub/RefEntities/EpubByteContentFileRef.cs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System.Threading.Tasks;
2+
using VersOne.Epub.Options;
23

34
namespace VersOne.Epub
45
{
@@ -16,8 +17,9 @@ public class EpubByteContentFileRef : EpubContentFileRef
1617
/// <param name="fileName">Relative file path of the content file (as it is specified in the EPUB manifest).</param>
1718
/// <param name="contentType">The type of the content of the file.</param>
1819
/// <param name="contentMimeType">The MIME type of the content of the file.</param>
19-
public EpubByteContentFileRef(EpubBookRef epubBookRef, string fileName, EpubContentType contentType, string contentMimeType)
20-
: base(epubBookRef, fileName, contentType, contentMimeType)
20+
/// <param name="contentReaderOptions">Optional content reader options determining how to handle missing content files.</param>
21+
public EpubByteContentFileRef(EpubBookRef epubBookRef, string fileName, EpubContentType contentType, string contentMimeType, ContentReaderOptions contentReaderOptions = null)
22+
: base(epubBookRef, fileName, contentType, contentMimeType, contentReaderOptions)
2123
{
2224
}
2325

0 commit comments

Comments
 (0)