diff --git a/src/bbox.rs b/src/bbox.rs index 22170d6..1d24765 100644 --- a/src/bbox.rs +++ b/src/bbox.rs @@ -7,6 +7,7 @@ use nom::{ number, AsChar, IResult, Needed, }; +pub mod cr3_moov; mod idat; mod iinf; mod iloc; @@ -15,6 +16,8 @@ mod keys; mod meta; mod mvhd; mod tkhd; +mod uuid; +pub use cr3_moov::Cr3MoovBox; pub use ilst::IlstBox; pub use keys::KeysBox; pub use meta::MetaBox; diff --git a/src/bbox/cr3_moov.rs b/src/bbox/cr3_moov.rs new file mode 100644 index 0000000..db2029b --- /dev/null +++ b/src/bbox/cr3_moov.rs @@ -0,0 +1,145 @@ +use std::ops::Range; + +use nom::{combinator::fail, IResult}; + +use super::{ + uuid::{CanonUuidBox, CANON_UUID, UUID_SIZE}, + BoxHolder, +}; + +const MIN_CR3_INPUT_SIZE: usize = 8; + +const MIN_FTYP_BODY_SIZE: usize = 4; + +/// Represents the parsed moov box structure for Canon CR3 files. +/// +/// Canon CR3 files are based on the ISO Base Media File Format (similar to MP4/MOV) +/// but contain Canon-specific metadata in a UUID box within the moov container. +/// This struct provides access to the Canon UUID box containing EXIF metadata. +/// +/// # CR3 File Structure +/// CR3 File +/// +-- ftyp (file type box) +/// +-- moov (movie box) +/// | +-- uuid (Canon UUID box) +/// | +-- CMT1 (main EXIF data) +/// | +-- CMT2 (ExifIFD data) +/// | +-- CMT3 (MakerNotes data) +/// +-- mdat (media data) +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Cr3MoovBox { + /// Canon's UUID box containing CMT metadata, if present + uuid_canon_box: Option, +} + +impl Cr3MoovBox { + pub fn parse(input: &[u8]) -> IResult<&[u8], Option> { + // Validate minimum input size + if input.len() < MIN_CR3_INPUT_SIZE { + tracing::warn!( + "Input too small for CR3 parsing: {} bytes, expected at least {}", + input.len(), + MIN_CR3_INPUT_SIZE + ); + return fail(input); + } + + let remain = input; + let (remain, bbox) = BoxHolder::parse(remain)?; + + // Verify this is a valid file format by checking for ftyp box + if bbox.box_type() != "ftyp" { + tracing::warn!("Expected ftyp box, found: {}", bbox.box_type()); + return fail(input); + } + + // Validate ftyp box has minimum required size + if bbox.body_data().len() < MIN_FTYP_BODY_SIZE { + tracing::warn!( + "ftyp box too small: {} bytes, expected at least {}", + bbox.body_data().len(), + MIN_FTYP_BODY_SIZE + ); + return fail(input); + } + + // Find the moov box containing the metadata + let (remain, Some(moov_bbox)) = super::find_box(remain, "moov")? else { + tracing::debug!("moov box not found in CR3 file"); + return Ok((remain, None)); + }; + + tracing::debug!( + box_type = moov_bbox.box_type(), + size = moov_bbox.header.box_size, + "Found moov box in CR3 file" + ); + + // Parse the moov box contents to find Canon UUID box + let (_, moov_box) = Self::parse_moov_content(moov_bbox.body_data(), input)?; + tracing::debug!(?moov_box, "Successfully parsed CR3 moov box"); + + Ok((remain, Some(moov_box))) + } + + fn parse_moov_content<'a>( + moov_data: &'a [u8], + full_input: &'a [u8], + ) -> IResult<&'a [u8], Cr3MoovBox> { + let mut remain = moov_data; + let mut uuid_canon_box = None; + + // Iterate through all boxes within the moov box to find Canon's UUID box + while !remain.is_empty() { + let (new_remain, bbox) = match BoxHolder::parse(remain) { + Ok(result) => result, + Err(e) => { + tracing::warn!( + "Failed to parse box in moov content, continuing with partial data: {:?}", + e + ); + break; // Stop parsing but return what we found so far + } + }; + + if bbox.box_type() == "uuid" { + let body_data = bbox.body_data(); + + // Validate UUID box has minimum required size + if body_data.len() < UUID_SIZE { + tracing::debug!("UUID box too small: {} bytes", body_data.len()); + remain = new_remain; + continue; + } + + let uuid_bytes = &body_data[0..UUID_SIZE]; + + if uuid_bytes == CANON_UUID { + tracing::debug!( + "Found Canon UUID box with {} bytes of data", + body_data.len() + ); + let (_, canon_box) = CanonUuidBox::parse(body_data, full_input)?; + uuid_canon_box = Some(canon_box); + break; + } else { + tracing::debug!("Found non-Canon UUID box"); + } + } + + remain = new_remain; + } + + Ok((remain, Cr3MoovBox { uuid_canon_box })) + } + + #[allow(dead_code)] // API method for tests + pub fn uuid_canon_box(&self) -> Option<&CanonUuidBox> { + self.uuid_canon_box.as_ref() + } + + pub fn exif_data_offset(&self) -> Option> { + // For CR3, we primarily use CMT1 which contains the main EXIF IFD0 data + self.uuid_canon_box.as_ref()?.exif_data_offset().cloned() + } +} diff --git a/src/bbox/meta.rs b/src/bbox/meta.rs index 5fae31e..b74a635 100644 --- a/src/bbox/meta.rs +++ b/src/bbox/meta.rs @@ -78,6 +78,7 @@ impl ParseBody for MetaBox { } impl MetaBox { + #[allow(dead_code)] #[tracing::instrument(skip_all)] pub fn exif_data<'a>(&self, input: &'a [u8]) -> IResult<&'a [u8], Option<&'a [u8]>> { self.iinf diff --git a/src/bbox/uuid.rs b/src/bbox/uuid.rs new file mode 100644 index 0000000..e1af624 --- /dev/null +++ b/src/bbox/uuid.rs @@ -0,0 +1,212 @@ +use std::ops::Range; + +use nom::IResult; + +use super::BoxHolder; +use crate::exif::TiffHeader; + +/// Size of a UUID in bytes +pub const UUID_SIZE: usize = 16; + +/// Canon CMT box types +const CMT_BOX_TYPES: &[&str] = &["CMT1", "CMT2", "CMT3"]; + +/// Canon's UUID for CR3 files: 85c0b687-820f-11e0-8111-f4ce462b6a48 +pub const CANON_UUID: [u8; 16] = [ + 0x85, 0xc0, 0xb6, 0x87, 0x82, 0x0f, 0x11, 0xe0, 0x81, 0x11, 0xf4, 0xce, 0x46, 0x2b, 0x6a, 0x48, +]; + +/// Represents Canon's UUID box containing CMT (Canon Metadata) boxes. +/// +/// Canon CR3 files store EXIF metadata in a proprietary UUID box format. +/// The UUID box contains three CMT (Canon Metadata) sub-boxes: +/// - CMT1: Main EXIF IFD0 data (camera settings, basic metadata) +/// - CMT2: ExifIFD data (detailed EXIF information) +/// - CMT3: MakerNotes data (Canon-specific metadata) +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CanonUuidBox { + /// CMT1 contains the main EXIF IFD0 data (primary metadata) + cmt1_offset: Option>, + /// CMT2 contains the ExifIFD data (detailed EXIF information) + cmt2_offset: Option>, + /// CMT3 contains the MakerNotes data (Canon-specific metadata) + cmt3_offset: Option>, +} + +impl CanonUuidBox { + /// Returns the offset range for the primary EXIF data (CMT1). + pub fn exif_data_offset(&self) -> Option<&Range> { + // For CR3, we primarily use CMT1 which contains the main EXIF IFD0 data + self.cmt1_offset.as_ref() + } + + /// Returns the offset range for the ExifIFD data (CMT2). + #[allow(dead_code)] // API method for future use + pub fn cmt2_data_offset(&self) -> Option<&Range> { + self.cmt2_offset.as_ref() + } + + /// Returns the offset range for the MakerNotes data (CMT3). + #[allow(dead_code)] // API method for future use + pub fn cmt3_data_offset(&self) -> Option<&Range> { + self.cmt3_offset.as_ref() + } + + /// Parses Canon's UUID box to extract CMT (Canon Metadata) box offsets. + pub fn parse<'a>(uuid_data: &'a [u8], full_input: &'a [u8]) -> IResult<&'a [u8], CanonUuidBox> { + // Validate input sizes + if uuid_data.len() < UUID_SIZE { + tracing::error!( + "Canon UUID box data too small: {} bytes, expected at least {}", + uuid_data.len(), + UUID_SIZE + ); + return nom::combinator::fail(uuid_data); + } + + if full_input.is_empty() { + tracing::error!("Full input is empty for Canon UUID box parsing"); + return nom::combinator::fail(uuid_data); + } + + // Skip the UUID header + let mut remain = &uuid_data[UUID_SIZE..]; + let mut cmt1_offset = None; + let mut cmt2_offset = None; + let mut cmt3_offset = None; + + tracing::debug!( + "Parsing Canon UUID box with {} bytes of CMT data", + remain.len() + ); + + // Parse CMT boxes within the Canon UUID box + while !remain.is_empty() { + let (new_remain, bbox) = match BoxHolder::parse(remain) { + Ok(result) => result, + Err(e) => { + tracing::warn!( + "Failed to parse CMT box, continuing with partial data: {:?}", + e + ); + break; // Stop parsing but return what we found so far + } + }; + + let box_type = bbox.box_type(); + if CMT_BOX_TYPES.contains(&box_type) { + // Calculate offset safely using slice bounds checking + let data_start = bbox.data.as_ptr() as usize; + let input_start = full_input.as_ptr() as usize; + + // Ensure the data pointer is within the input bounds + if data_start < input_start || data_start >= input_start + full_input.len() { + tracing::warn!("CMT box data pointer outside input bounds"); + remain = new_remain; + continue; + } + + let start_offset = data_start - input_start; + let body_start = start_offset + bbox.header_size(); + let body_end = start_offset + bbox.data.len(); + + // Validate offset ranges are within bounds + if body_end > full_input.len() { + tracing::warn!( + "CMT box body extends beyond input bounds: {}..{} > {}", + body_start, + body_end, + full_input.len() + ); + remain = new_remain; + continue; + } + + let offset_range = body_start..body_end; + + // Validate CMT box data has minimum size and reasonable content + let cmt_data = &full_input[offset_range.clone()]; + if !Self::validate_cmt_data(box_type, cmt_data) { + tracing::warn!("CMT box {} failed validation, skipping", box_type); + remain = new_remain; + continue; + } + + match box_type { + "CMT1" => { + cmt1_offset = Some(offset_range); + tracing::debug!("Found CMT1 (IFD0) at offset {}..{}", body_start, body_end); + } + "CMT2" => { + cmt2_offset = Some(offset_range); + tracing::debug!( + "Found CMT2 (ExifIFD) at offset {}..{}", + body_start, + body_end + ); + } + "CMT3" => { + cmt3_offset = Some(offset_range); + tracing::debug!( + "Found CMT3 (MakerNotes) at offset {}..{}", + body_start, + body_end + ); + } + _ => unreachable!("box_type should be one of CMT1, CMT2, or CMT3"), + } + } else { + // Skip unknown boxes within Canon UUID + tracing::debug!("Skipping unknown box type: {}", box_type); + } + + remain = new_remain; + } + + Ok(( + remain, + CanonUuidBox { + cmt1_offset, + cmt2_offset, + cmt3_offset, + }, + )) + } + + /// Validates CMT box data for basic integrity. + fn validate_cmt_data(box_type: &str, data: &[u8]) -> bool { + // Minimum size check - CMT boxes should have at least 8 bytes + if data.len() < 8 { + tracing::warn!("CMT box {} too small: {} bytes", box_type, data.len()); + return false; + } + + match box_type { + "CMT1" => { + // CMT1 should start with TIFF header - validate using TiffHeader::parse + if TiffHeader::parse(data).is_ok() { + tracing::debug!("CMT1 has valid TIFF header"); + true + } else { + tracing::warn!("CMT1 does not have valid TIFF header"); + false + } + } + "CMT2" | "CMT3" => { + // CMT2 and CMT3 should also be TIFF format, but we're more lenient + // since they might have different internal structures + if data.len() >= 8 { + tracing::debug!("CMT box {} has sufficient size", box_type); + true + } else { + tracing::warn!("CMT box {} too small for valid data", box_type); + false + } + } + _ => { + tracing::warn!("Unknown CMT box type: {}", box_type); + false + } + } + } +} diff --git a/src/cr3.rs b/src/cr3.rs new file mode 100644 index 0000000..60a8710 --- /dev/null +++ b/src/cr3.rs @@ -0,0 +1,141 @@ +use nom::IResult; + +use crate::{ + bbox::Cr3MoovBox, + error::{nom_error_to_parsing_error_with_state, ParsingError, ParsingErrorState}, + exif::{check_exif_header2, TiffHeader}, + parser::ParsingState, +}; + +pub(crate) fn parse_moov_box(input: &[u8]) -> IResult<&[u8], Option> { + Cr3MoovBox::parse(input) +} + +pub(crate) fn extract_exif_data( + state: Option, + buf: &[u8], +) -> Result<(Option<&[u8]>, Option), ParsingErrorState> { + let (data, state) = match state { + Some(ParsingState::Cr3ExifSize(size)) => { + let (_, data) = nom::bytes::streaming::take(size)(buf) + .map_err(|e| nom_error_to_parsing_error_with_state(e, state.clone()))?; + (Some(data), state) + } + None => { + let (_, moov) = + parse_moov_box(buf).map_err(|e| nom_error_to_parsing_error_with_state(e, state))?; + + if let Some(moov) = moov { + if let Some(range) = moov.exif_data_offset() { + if range.end > buf.len() { + let state = ParsingState::Cr3ExifSize(range.len()); + let clear_and_skip = ParsingError::ClearAndSkip(range.start); + return Err(ParsingErrorState::new(clear_and_skip, Some(state))); + } else { + (Some(&buf[range]), None) + } + } else { + return Err(ParsingErrorState::new( + ParsingError::Failed( + "CR3 file contains no EXIF data: Canon UUID box found but no CMT1 offset available".into(), + ), + None, + )); + } + } else { + (None, None) + } + } + _ => unreachable!(), + }; + + // For CR3 files, the CMT1 data already contains TIFF header, so we don't need to check for EXIF header + let data = data.and_then(|x| { + if TiffHeader::parse(x).is_ok() { + Some(x) + } else { + // Try to find TIFF header if not at the beginning + check_exif_header2(x).map(|x| x.0).ok() + } + }); + + Ok((data, state)) +} + +#[cfg(test)] +mod tests { + use crate::bbox::Cr3MoovBox; + use crate::testkit::*; + use crate::{MediaParser, MediaSource}; + use std::io::Read; + use test_case::test_case; + + #[test_case("canon-r6.cr3")] + fn cr3_parse_with_media_parser(path: &str) { + let _ = tracing_subscriber::fmt().with_test_writer().try_init(); + + let mut parser = MediaParser::new(); + let ms = MediaSource::file_path(format!("testdata/{}", path)).unwrap(); + assert!(ms.has_exif()); + + let iter: crate::ExifIter = parser.parse(ms).unwrap(); + let exif: crate::Exif = iter.into(); + + let mut expect = String::new(); + open_sample(&format!("{path}.sorted.txt")) + .unwrap() + .read_to_string(&mut expect) + .unwrap(); + + assert_eq!(sorted_exif_entries(&exif).join("\n"), expect.trim()); + } + + #[test_case("canon-r6.cr3")] + fn cr3_moov_box_parsing(path: &str) { + let _ = tracing_subscriber::fmt().with_test_writer().try_init(); + + let buf = read_sample(path).unwrap(); + let (_, moov_box) = Cr3MoovBox::parse(&buf[..]).unwrap(); + + assert!(moov_box.is_some(), "Moov box should be found"); + let moov_box = moov_box.unwrap(); + + let canon_box = moov_box.uuid_canon_box().unwrap(); + + assert!( + canon_box.exif_data_offset().is_some(), + "CMT1 box should be found" + ); + assert!( + canon_box.cmt2_data_offset().is_some(), + "CMT2 box should be found" + ); + assert!( + canon_box.cmt3_data_offset().is_some(), + "CMT3 box should be found" + ); + + // Verify the offsets are reasonable + let cmt1 = canon_box.exif_data_offset().unwrap(); + assert!(cmt1.start < cmt1.end, "CMT1 offset range should be valid"); + assert!( + cmt1.end <= buf.len(), + "CMT1 offset should be within file bounds" + ); + } + + #[test_case("canon-r6.cr3")] + fn test_cmt_api_access(path: &str) { + let _ = tracing_subscriber::fmt().with_test_writer().try_init(); + + let buf = read_sample(path).unwrap(); + let (_, moov_box) = Cr3MoovBox::parse(&buf[..]).unwrap(); + let moov_box = moov_box.expect("Should have moov box"); + + // Test CMT1 access (should be available) + assert!( + moov_box.exif_data_offset().is_some(), + "Should have CMT1 data" + ); + } +} diff --git a/src/exif.rs b/src/exif.rs index 870763a..cae9e32 100644 --- a/src/exif.rs +++ b/src/exif.rs @@ -4,26 +4,28 @@ use crate::parser::{BufParser, ParsingState, ShareBuf}; use crate::raf::RafInfo; use crate::skip::Skip; use crate::slice::SubsliceRange; -use crate::{heif, jpeg, MediaParser, MediaSource}; +use crate::{cr3, heif, jpeg, MediaParser, MediaSource}; #[allow(deprecated)] use crate::{partial_vec::PartialVec, FileFormat}; pub use exif_exif::Exif; -use exif_exif::{check_exif_header2, TIFF_HEADER_LEN}; +use exif_exif::TIFF_HEADER_LEN; use exif_iter::input_into_iter; pub use exif_iter::{ExifIter, ParsedExifEntry}; pub use gps::{GPSInfo, LatLng}; +pub use multi_exif::{DuplicateStrategy, MultiExifIter}; pub use tags::ExifTag; use std::io::Read; use std::ops::Range; pub(crate) mod ifd; -pub(crate) use exif_exif::{check_exif_header, TiffHeader}; +pub(crate) use exif_exif::{check_exif_header, check_exif_header2, TiffHeader}; pub(crate) use travel::IfdHeaderTravel; mod exif_exif; mod exif_iter; mod gps; +mod multi_exif; mod tags; mod travel; @@ -79,6 +81,42 @@ pub(crate) fn parse_exif_iter>( range_to_iter(parser, out) } +#[tracing::instrument(skip(reader))] +pub(crate) fn parse_multi_exif_iter>( + parser: &mut MediaParser, + mime_img: MimeImage, + reader: &mut R, +) -> Result { + if mime_img != MimeImage::Cr3 { + return Err(format!("MultiExifIter is not supported for {mime_img:?}").into()); + } + + let mut iter = MultiExifIter::new(DuplicateStrategy::IgnoreDuplicates); + + // TODO: The following is only demonstration code. + // Please make further modifications based on the CR3 file structure. + // For example, the `parse` callback of `load_and_parse` should be reimplemented + // to correctly parse the next CMT* box. + + loop { + let out = parser.load_and_parse::(reader, |buf, state| { + extract_exif_range(mime_img, buf, state) + })?; + if out.is_none() { + break; + } + + // TODO: The current `block_id` should be returned via the `load_and_parse` call. + let block_id = "CMT1"; + let data = out + .map(|(range, _)| parser.share_buf(range)) + .ok_or_else(|| format!("Exif not found in block {block_id}"))?; + iter.add_tiff_data(block_id.to_owned(), data, None); + } + + Ok(iter) +} + type ExifRangeResult = Result, Option)>, ParsingErrorState>; fn extract_exif_range(img: MimeImage, buf: &[u8], state: Option) -> ExifRangeResult { @@ -86,6 +124,7 @@ fn extract_exif_range(img: MimeImage, buf: &[u8], state: Option) - let header = state.and_then(|x| match x { ParsingState::TiffHeader(h) => Some(h), ParsingState::HeifExifSize(_) => None, + ParsingState::Cr3ExifSize(_) => None, }); Ok(exif_data .and_then(|x| buf.subslice_in_range(x)) @@ -174,6 +213,7 @@ pub(crate) fn extract_exif_with_mime( MimeImage::Raf => RafInfo::parse(buf) .map(|res| (res.1.exif_data, state.clone())) .map_err(|e| nom_error_to_parsing_error_with_state(e, state))?, + MimeImage::Cr3 => cr3_extract_exif(state, buf)?, }; Ok((exif_data, state)) } @@ -182,41 +222,14 @@ fn heif_extract_exif( state: Option, buf: &[u8], ) -> Result<(Option<&[u8]>, Option), ParsingErrorState> { - let (data, state) = match state { - Some(ParsingState::HeifExifSize(size)) => { - let (_, data) = nom::bytes::streaming::take(size)(buf) - .map_err(|e| nom_error_to_parsing_error_with_state(e, state.clone()))?; - (Some(data), state) - } - None => { - let (_, meta) = heif::parse_meta_box(buf) - .map_err(|e| nom_error_to_parsing_error_with_state(e, state))?; - - if let Some(meta) = meta { - if let Some(range) = meta.exif_data_offset() { - if range.end > buf.len() { - let state = ParsingState::HeifExifSize(range.len()); - let clear_and_skip = ParsingError::ClearAndSkip(range.start); - return Err(ParsingErrorState::new(clear_and_skip, Some(state))); - } else { - (Some(&buf[range]), None) - } - } else { - return Err(ParsingErrorState::new( - ParsingError::Failed("no exif offset in meta box".into()), - None, - )); - } - } else { - (None, None) - } - } - _ => unreachable!(), - }; - - let data = data.and_then(|x| check_exif_header2(x).map(|x| x.0).ok()); + heif::extract_exif_data(state, buf) +} - Ok((data, state)) +fn cr3_extract_exif( + state: Option, + buf: &[u8], +) -> Result<(Option<&[u8]>, Option), ParsingErrorState> { + cr3::extract_exif_data(state, buf) } #[cfg(feature = "async")] diff --git a/src/exif/multi_exif.rs b/src/exif/multi_exif.rs new file mode 100644 index 0000000..e1d3af3 --- /dev/null +++ b/src/exif/multi_exif.rs @@ -0,0 +1,563 @@ +use std::collections::HashSet; + +use crate::{partial_vec::PartialVec, values::EntryValue, ExifTag}; + +use super::{ + exif_iter::{input_into_iter, ExifIter, ParsedExifEntry}, + TiffHeader, +}; + +/// Strategy for handling duplicate tags across multiple TIFF blocks +#[derive(Debug, Clone, Copy)] +#[allow(dead_code)] +pub enum DuplicateStrategy { + /// Ignore duplicate tags (skip subsequent occurrences) + IgnoreDuplicates, + /// Allow duplicate tags (emit all occurrences) + AllowDuplicates, +} + +/// A TIFF data source for lazy loading +struct TiffDataSource { + /// Block identifier + block_id: String, + /// Data loader function (lazy loading) + data_loader: Box crate::Result + Send + Sync>, + /// TIFF header information (optional, if known) + header: Option, + /// Whether loading has been attempted + load_attempted: bool, +} + +/// An iterator for multiple TIFF/Exif data blocks with lazy loading support. +/// +/// This is designed for files like Canon CR3 that contain multiple TIFF data +/// blocks (e.g., in CMT1/CMT2 boxes) that need to be processed together. +pub struct MultiExifIter { + /// TIFF data sources (lazy loading) + tiff_sources: Vec, + /// Current TIFF block index being iterated + current_block_index: usize, + /// Currently loaded ExifIter (created only when needed) + current_iter: Option, + /// Tag handling strategy for duplicates + duplicate_strategy: DuplicateStrategy, + /// Set of encountered tags for duplicate detection (ifd_index, tag_code) + encountered_tags: HashSet<(usize, u16)>, +} + +/// A parsed EXIF entry from multiple TIFF blocks +pub struct MultiExifParsedEntry { + /// The original parsed entry + inner: ParsedExifEntry, + /// Source TIFF block identifier + source_block_id: String, + /// Source TIFF block index + source_block_index: usize, +} + +#[allow(dead_code)] +impl MultiExifParsedEntry { + /// Get the source TIFF block identifier + pub fn source_block_id(&self) -> &str { + &self.source_block_id + } + + /// Get the source TIFF block index + pub fn source_block_index(&self) -> usize { + self.source_block_index + } + + /// Get the IFD index value where this entry is located. + /// - 0: ifd0 (main image) + /// - 1: ifd1 (thumbnail) + pub fn ifd_index(&self) -> usize { + self.inner.ifd_index() + } + + /// Get recognized Exif tag of this entry, maybe return `None` if the tag + /// is unrecognized. + pub fn tag(&self) -> Option { + self.inner.tag() + } + + /// Get the raw tag code of this entry. + pub fn tag_code(&self) -> u16 { + self.inner.tag_code() + } + + /// Returns true if there is an `EntryValue` in self. + pub fn has_value(&self) -> bool { + self.inner.has_value() + } + + /// Get the parsed entry value of this entry. + pub fn get_value(&self) -> Option<&EntryValue> { + self.inner.get_value() + } + + /// Takes out the parsed entry value of this entry. + /// + /// **Note**: This method can only be called once! Once it has been called, + /// calling it again always returns `None`. + pub fn take_value(&mut self) -> Option { + self.inner.take_value() + } + + /// Get the parsed result of this entry. + pub fn get_result(&self) -> Result<&EntryValue, &super::exif_iter::EntryError> { + self.inner.get_result() + } + + /// Takes out the parsed result of this entry. + /// + /// **Note**: This method can ONLY be called once! If you call it twice, it + /// will **panic** directly! + pub fn take_result(&mut self) -> Result { + self.inner.take_result() + } +} + +impl std::fmt::Debug for MultiExifParsedEntry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MultiParsedExifEntry") + .field("source_block_id", &self.source_block_id) + .field("source_block_index", &self.source_block_index) + .field("inner", &self.inner) + .finish() + } +} + +#[allow(dead_code)] +impl MultiExifIter { + /// Create a new multi-TIFF iterator with the specified duplicate handling strategy + pub fn new(strategy: DuplicateStrategy) -> Self { + Self { + tiff_sources: Vec::new(), + current_block_index: 0, + current_iter: None, + duplicate_strategy: strategy, + encountered_tags: HashSet::new(), + } + } + + /// Add a TIFF data source with lazy loading + /// + /// # Arguments + /// * `block_id` - Identifier for this TIFF block (e.g., "CMT1", "CMT2") + /// * `loader` - Function that returns TIFF data when called + /// * `header` - Optional TIFF header if already parsed + pub fn add_tiff_source(&mut self, block_id: String, loader: F, header: Option) + where + F: Fn() -> crate::Result + Send + Sync + 'static, + { + self.tiff_sources.push(TiffDataSource { + block_id, + data_loader: Box::new(loader), + header, + load_attempted: false, + }); + } + + /// Add already available TIFF data (immediately usable) + /// + /// # Arguments + /// * `block_id` - Identifier for this TIFF block + /// * `data` - TIFF data + /// * `header` - Optional TIFF header if already parsed + pub fn add_tiff_data( + &mut self, + block_id: String, + data: PartialVec, + header: Option, + ) { + self.add_tiff_source(block_id, move || Ok(data.clone()), header); + } + + /// Get the number of TIFF blocks + pub fn block_count(&self) -> usize { + self.tiff_sources.len() + } + + /// Get current block information (block_id, block_index) + pub fn current_block_info(&self) -> Option<(&str, usize)> { + if self.current_block_index < self.tiff_sources.len() { + Some(( + &self.tiff_sources[self.current_block_index].block_id, + self.current_block_index, + )) + } else { + None + } + } + + /// Reset the iterator to the beginning + pub fn rewind(&mut self) { + self.current_block_index = 0; + self.current_iter = None; + self.encountered_tags.clear(); + + // Reset load_attempted flags + for source in &mut self.tiff_sources { + source.load_attempted = false; + } + } + + /// Load the next TIFF block and create an ExifIter for it + fn load_next_block(&mut self) -> crate::Result { + if self.current_block_index >= self.tiff_sources.len() { + return Err("No more TIFF blocks to load".into()); + } + + let source = &mut self.tiff_sources[self.current_block_index]; + if source.load_attempted { + return Err("Block already failed to load".into()); + } + + source.load_attempted = true; + let data = (source.data_loader)()?; + tracing::debug!( + block_id = source.block_id, + block_index = self.current_block_index, + data_len = data.len(), + "Loading TIFF block" + ); + + match input_into_iter(data, source.header.clone()) { + Ok(iter) => { + tracing::debug!( + "Successfully created ExifIter for block {}", + source.block_id + ); + Ok(iter) + } + Err(e) => { + tracing::warn!( + block_id = source.block_id, + error = %e, + "Failed to create ExifIter for block" + ); + Err(e) + } + } + } +} + +impl Clone for MultiExifIter { + fn clone(&self) -> Self { + // Clone the iterator and reset to beginning + Self { + tiff_sources: self + .tiff_sources + .iter() + .map(|source| { + TiffDataSource { + block_id: source.block_id.clone(), + // Note: We can't clone the Box, so we create a new one that always fails + // This is a limitation - cloned iterators won't work with lazy loading + data_loader: Box::new(|| Err("Cannot clone lazy loader".into())), + header: source.header.clone(), + load_attempted: false, + } + }) + .collect(), + current_block_index: 0, + current_iter: None, + duplicate_strategy: self.duplicate_strategy, + encountered_tags: HashSet::new(), + } + } +} + +impl Iterator for MultiExifIter { + type Item = MultiExifParsedEntry; + + fn next(&mut self) -> Option { + loop { + // If current block iterator exists, try to get next entry + if let Some(ref mut current_iter) = self.current_iter { + for entry in current_iter.by_ref() { + // Check duplicate strategy + let tag_key = (entry.ifd_index(), entry.tag_code()); + let should_include = match self.duplicate_strategy { + DuplicateStrategy::IgnoreDuplicates => { + if self.encountered_tags.contains(&tag_key) { + false // Skip duplicate tag + } else { + self.encountered_tags.insert(tag_key); + true + } + } + DuplicateStrategy::AllowDuplicates => { + // Always allow, just record the tag + self.encountered_tags.insert(tag_key); + true + } + }; + + if should_include { + return Some(MultiExifParsedEntry { + inner: entry, + source_block_id: self.tiff_sources[self.current_block_index] + .block_id + .clone(), + source_block_index: self.current_block_index, + }); + } + // If tag should be skipped, continue to next entry + } + + // Current iterator is exhausted, move to next block + self.current_iter = None; + self.current_block_index += 1; + } + + // No current iterator, need to load the next block + if self.current_block_index >= self.tiff_sources.len() { + return None; // All blocks have been iterated + } + + // Lazy load the current TIFF block + match self.load_next_block() { + Ok(iter) => { + tracing::debug!( + block_index = self.current_block_index, + block_id = self.tiff_sources[self.current_block_index].block_id, + "Successfully loaded TIFF block" + ); + self.current_iter = Some(iter); + // Continue the loop to get entries from the new block + } + Err(e) => { + tracing::warn!( + block_index = self.current_block_index, + block_id = self.tiff_sources[self.current_block_index].block_id, + error = %e, + "Failed to load TIFF block, skipping" + ); + // Move to next block + self.current_block_index += 1; + } + } + } + } +} + +impl std::fmt::Debug for MultiExifIter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MultiExifIter") + .field("block_count", &self.tiff_sources.len()) + .field("current_block_index", &self.current_block_index) + .field("duplicate_strategy", &self.duplicate_strategy) + .field("encountered_tags_count", &self.encountered_tags.len()) + .field("has_current_iter", &self.current_iter.is_some()) + .finish() + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use super::*; + use crate::partial_vec::PartialVec; + + #[test] + fn test_multi_exif_iter_creation() { + let iter = MultiExifIter::new(DuplicateStrategy::IgnoreDuplicates); + assert_eq!(iter.block_count(), 0); + assert!(iter.current_block_info().is_none()); + } + + #[test] + fn test_add_tiff_data() { + let mut iter = MultiExifIter::new(DuplicateStrategy::IgnoreDuplicates); + let data = PartialVec::from(vec![0u8; 100]); + + iter.add_tiff_data("test_block".to_string(), data, None); + assert_eq!(iter.block_count(), 1); + } + + #[test] + fn test_add_tiff_source() { + let mut iter = MultiExifIter::new(DuplicateStrategy::IgnoreDuplicates); + + iter.add_tiff_source( + "test_block".to_string(), + || Ok(PartialVec::from(vec![0u8; 100])), + None, + ); + + assert_eq!(iter.block_count(), 1); + } + + #[test] + fn test_multi_tiff_with_real_data() { + use super::super::extract_exif_with_mime; + use crate::file::MimeImage; + use crate::slice::SubsliceRange; + use crate::testkit::read_sample; + + let mut iter = MultiExifIter::new(DuplicateStrategy::IgnoreDuplicates); + + // Use real TIFF data from test files + let tiff_buf = read_sample("tif.tif").unwrap(); + let (tiff_data, _) = extract_exif_with_mime(MimeImage::Tiff, &tiff_buf, None).unwrap(); + + if let Some(exif_range) = tiff_data.and_then(|x| tiff_buf.subslice_in_range(x)) { + let exif_data = &tiff_buf[exif_range]; + + // Add the same TIFF data twice as different blocks + iter.add_tiff_data( + "CMT1".to_string(), + PartialVec::from(exif_data.to_vec()), + None, + ); + iter.add_tiff_data( + "CMT2".to_string(), + PartialVec::from(exif_data.to_vec()), + None, + ); + + assert_eq!(iter.block_count(), 2); + + let mut entries = Vec::new(); + for entry in &mut iter { + println!( + "Real data test - Got entry: block={}, tag={:04x}", + entry.source_block_id(), + entry.tag_code() + ); + assert_eq!(entry.source_block_id, "CMT1"); + entries.push(entry); + } + + println!("Real data test - Total entries: {}", entries.len()); + // With IgnoreDuplicates strategy, we should get entries only from the first block + assert!(!entries.is_empty(), "Should have at least some entries"); + } else { + panic!("Failed to extract TIFF data from test file"); + } + } + + #[test] + fn test_duplicate_strategies() { + let ignore_iter = MultiExifIter::new(DuplicateStrategy::IgnoreDuplicates); + let allow_iter = MultiExifIter::new(DuplicateStrategy::AllowDuplicates); + + assert!(matches!( + ignore_iter.duplicate_strategy, + DuplicateStrategy::IgnoreDuplicates + )); + assert!(matches!( + allow_iter.duplicate_strategy, + DuplicateStrategy::AllowDuplicates + )); + } + + #[test] + fn test_multi_tiff_with_allow_duplicates_strategy() { + use super::super::extract_exif_with_mime; + use crate::file::MimeImage; + use crate::slice::SubsliceRange; + use crate::testkit::read_sample; + + let mut iter = MultiExifIter::new(DuplicateStrategy::AllowDuplicates); + + // Use real TIFF data from test files + let tiff_buf = read_sample("tif.tif").unwrap(); + let (tiff_data, _) = extract_exif_with_mime(MimeImage::Tiff, &tiff_buf, None).unwrap(); + + if let Some(exif_range) = tiff_data.and_then(|x| tiff_buf.subslice_in_range(x)) { + let exif_data = &tiff_buf[exif_range]; + + // Add the same TIFF data twice as different blocks + iter.add_tiff_data( + "CMT1".to_string(), + PartialVec::from(exif_data.to_vec()), + None, + ); + iter.add_tiff_data( + "CMT2".to_string(), + PartialVec::from(exif_data.to_vec()), + None, + ); + + assert_eq!(iter.block_count(), 2); + + let mut entries = HashMap::new(); + for entry in &mut iter { + println!( + "Overwrite test - Got entry: block={}, tag={:04x}", + entry.source_block_id(), + entry.tag_code() + ); + entries.insert((entry.ifd_index(), entry.tag_code()), entry); + } + + println!("Allow duplicates test - Total entries: {}", entries.len()); + // With AllowDuplicates strategy, we should get entries from both blocks + assert!(!entries.is_empty(), "Should have at least some entries"); + + let block_ids: std::collections::HashSet<_> = + entries.iter().map(|e| e.1.source_block_id()).collect(); + assert!( + !block_ids.is_empty(), + "Should have entries from at least one block" + ); + for id in block_ids { + assert_eq!(id, "CMT2"); + } + } else { + panic!("Failed to extract TIFF data from test file"); + } + } + + #[test] + fn test_lazy_loading_with_error() { + use crate::slice::SubsliceRange; + + let mut iter = MultiExifIter::new(DuplicateStrategy::IgnoreDuplicates); + + // Add a source that will fail to load + iter.add_tiff_source( + "failing_block".to_string(), + || Err("Simulated loading error".into()), + None, + ); + + // Add a successful source using real TIFF data + let tiff_buf = crate::testkit::read_sample("tif.tif").unwrap(); + let (tiff_data, _) = + super::super::extract_exif_with_mime(crate::file::MimeImage::Tiff, &tiff_buf, None) + .unwrap(); + + if let Some(exif_range) = tiff_data.and_then(|x| tiff_buf.subslice_in_range(x)) { + let exif_data: &[u8] = &tiff_buf[exif_range]; + iter.add_tiff_data( + "good_block".to_string(), + PartialVec::from(exif_data.to_vec()), + None, + ); + + let mut entries = Vec::new(); + for entry in &mut iter { + println!( + "Error test - Got entry: block={}, tag={:04x}", + entry.source_block_id(), + entry.tag_code() + ); + entries.push(entry); + } + + // Should only get entries from the successful block + assert!( + !entries.is_empty(), + "Should have at least some entries from the good block" + ); + for entry in &entries { + assert_eq!(entry.source_block_id(), "good_block"); + } + } + } +} diff --git a/src/file.rs b/src/file.rs index 5ee1fc8..787fe95 100644 --- a/src/file.rs +++ b/src/file.rs @@ -37,6 +37,8 @@ const MP4_BRAND_NAMES: &[&str] = &[ const QT_BRAND_NAMES: &[&str] = &["qt ", "mqt "]; +const CR3_BRAND_NAMES: &[&str] = &["crx "]; + #[derive(Debug, Clone, PartialEq, Eq, Copy)] pub(crate) enum Mime { Image(MimeImage), @@ -65,6 +67,7 @@ pub(crate) enum MimeImage { Heif, Tiff, Raf, // Fujifilm RAW, image/x-fuji-raf + Cr3, // Canon RAW, image/x-canon-cr3 } #[derive(Debug, Clone, PartialEq, Eq, Copy)] @@ -219,6 +222,11 @@ fn parse_bmff_mime(input: &[u8]) -> crate::Result { return Ok(Mime::Video(MimeVideo::Mp4)); } + // Check if it is a CR3 file + if CR3_BRAND_NAMES.iter().any(|v| v.as_bytes() == major_brand) { + return Ok(Mime::Image(MimeImage::Cr3)); + } + // Check compatible brands let compatible_brands = ftyp.body_data(); diff --git a/src/heif.rs b/src/heif.rs index 016980f..dc26f34 100644 --- a/src/heif.rs +++ b/src/heif.rs @@ -1,13 +1,15 @@ use std::io::{Read, Seek}; use nom::combinator::fail; -use nom::{number::complete::be_u32, IResult}; +use nom::IResult; use crate::bbox::find_box; use crate::exif::Exif; use crate::{ bbox::{BoxHolder, MetaBox, ParseBox}, - exif::check_exif_header, + error::{nom_error_to_parsing_error_with_state, ParsingError, ParsingErrorState}, + exif::check_exif_header2, + parser::ParsingState, }; use crate::{ExifIter, MediaParser, MediaSource}; @@ -42,17 +44,45 @@ pub fn parse_heif_exif(reader: R) -> crate::Result> Ok(Some(iter.into())) } -/// Extract Exif TIFF data from the bytes of a HEIF/HEIC file. -#[allow(unused)] -#[tracing::instrument(skip_all)] -pub(crate) fn extract_exif_data(input: &[u8]) -> IResult<&[u8], Option<&[u8]>> { - let (remain, meta) = parse_meta_box(input)?; +pub(crate) fn extract_exif_data( + state: Option, + buf: &[u8], +) -> Result<(Option<&[u8]>, Option), ParsingErrorState> { + let (data, state) = match state { + Some(ParsingState::HeifExifSize(size)) => { + let (_, data) = nom::bytes::streaming::take(size)(buf) + .map_err(|e| nom_error_to_parsing_error_with_state(e, state.clone()))?; + (Some(data), state) + } + None => { + let (_, meta) = + parse_meta_box(buf).map_err(|e| nom_error_to_parsing_error_with_state(e, state))?; + + if let Some(meta) = meta { + if let Some(range) = meta.exif_data_offset() { + if range.end > buf.len() { + let state = ParsingState::HeifExifSize(range.len()); + let clear_and_skip = ParsingError::ClearAndSkip(range.start); + return Err(ParsingErrorState::new(clear_and_skip, Some(state))); + } else { + (Some(&buf[range]), None) + } + } else { + return Err(ParsingErrorState::new( + ParsingError::Failed("no exif offset in meta box".into()), + None, + )); + } + } else { + (None, None) + } + } + _ => unreachable!(), + }; - if let Some(meta) = meta { - extract_exif_with_meta(input, &meta) - } else { - Ok((remain, None)) - } + let data = data.and_then(|x| check_exif_header2(x).map(|x| x.0).ok()); + + Ok((data, state)) } pub(crate) fn parse_meta_box(input: &[u8]) -> IResult<&[u8], Option> { @@ -76,28 +106,6 @@ pub(crate) fn parse_meta_box(input: &[u8]) -> IResult<&[u8], Option> { Ok((remain, Some(bbox))) } -pub(crate) fn extract_exif_with_meta<'a>( - input: &'a [u8], - bbox: &MetaBox, -) -> IResult<&'a [u8], Option<&'a [u8]>> { - let (out_remain, data) = bbox.exif_data(input)?; - tracing::debug!( - data_len = data.as_ref().map(|x| x.len()), - "exif data extracted" - ); - - if let Some(data) = data { - let (remain, _) = be_u32(data)?; - if check_exif_header(remain)? { - Ok((out_remain, Some(&remain[6..]))) // Safe-slice - } else { - Ok((out_remain, None)) - } - } else { - Ok((out_remain, None)) - } -} - #[allow(deprecated)] #[cfg(test)] mod tests { @@ -134,7 +142,7 @@ mod tests { let _ = tracing_subscriber::fmt().with_test_writer().try_init(); let buf = read_sample(path).unwrap(); - let (_, exif) = extract_exif_data(&buf[..]).unwrap(); + let (exif, _state) = extract_exif_data(None, &buf[..]).unwrap(); if exif_size == 0 { assert!(exif.is_none()); diff --git a/src/lib.rs b/src/lib.rs index e991360..4887971 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -340,6 +340,7 @@ pub use mov::{parse_metadata, parse_mov_metadata}; mod bbox; mod buffer; +mod cr3; mod ebml; mod error; mod exif; diff --git a/src/parser.rs b/src/parser.rs index f7b6aaa..c357034 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -12,7 +12,7 @@ use std::{ use crate::{ buffer::Buffers, error::{ParsedError, ParsingError, ParsingErrorState}, - exif::{parse_exif_iter, TiffHeader}, + exif::{parse_exif_iter, parse_multi_exif_iter, MultiExifIter, TiffHeader}, file::Mime, partial_vec::PartialVec, skip::Skip, @@ -143,6 +143,7 @@ pub(crate) trait Buf { pub(crate) enum ParsingState { TiffHeader(TiffHeader), HeifExifSize(usize), + Cr3ExifSize(usize), } impl Display for ParsingState { @@ -150,6 +151,7 @@ impl Display for ParsingState { match self { ParsingState::TiffHeader(h) => Display::fmt(&format!("ParsingState: {h:?})"), f), ParsingState::HeifExifSize(n) => Display::fmt(&format!("ParsingState: {n}"), f), + ParsingState::Cr3ExifSize(n) => Display::fmt(&format!("ParsingState: {n}"), f), } } } @@ -323,6 +325,15 @@ impl> ParseOutput for ExifIter { } } +impl> ParseOutput for MultiExifIter { + fn parse(parser: &mut MediaParser, mut ms: MediaSource) -> crate::Result { + if !ms.has_exif() { + return Err(crate::Error::ParseFailed("no Exif data here".into())); + } + parse_multi_exif_iter::(parser, ms.mime.unwrap_image(), &mut ms.reader) + } +} + impl> ParseOutput for TrackInfo { fn parse(parser: &mut MediaParser, mut ms: MediaSource) -> crate::Result { if !ms.has_track() { diff --git a/testdata/canon-r6.cr3 b/testdata/canon-r6.cr3 new file mode 100644 index 0000000..93fe377 Binary files /dev/null and b/testdata/canon-r6.cr3 differ diff --git a/testdata/canon-r6.cr3.sorted.txt b/testdata/canon-r6.cr3.sorted.txt new file mode 100644 index 0000000..93e5bfa --- /dev/null +++ b/testdata/canon-r6.cr3.sorted.txt @@ -0,0 +1,10 @@ +Copyright » Copyright 2025 Roland Dreier - CC BY-SA +ImageHeight » 3648 +ImageWidth » 5472 +Make » Canon +Model » Canon EOS R6 +ModifyDate » 2025-09-06 20:24:47 +Orientation » 1 +ResolutionUnit » 2 +XResolution » 72/1 (72.0000) +YResolution » 72/1 (72.0000) \ No newline at end of file