From 45d923821de94697881941b45b500caecd81649a Mon Sep 17 00:00:00 2001 From: Timur Osmanov Date: Wed, 24 Sep 2025 10:34:54 +0300 Subject: [PATCH 1/8] fix: double anchor --- foliant/preprocessors/includes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/foliant/preprocessors/includes.py b/foliant/preprocessors/includes.py index 2bcd18c..f536e9e 100644 --- a/foliant/preprocessors/includes.py +++ b/foliant/preprocessors/includes.py @@ -658,7 +658,6 @@ def _sub(m): self.logger.debug( f'An error {exception} occurred when resolving the link: {m.group("path")}' ) - link = m.group('path') return f'[{caption}]({link}{anchor})' From dcc77eb2c8c259af2ebb58e37d5770d2734047a9 Mon Sep 17 00:00:00 2001 From: Timur Osmanov Date: Wed, 24 Sep 2025 10:44:03 +0300 Subject: [PATCH 2/8] update: version and changelog.md --- changelog.md | 4 ++++ setup.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/changelog.md b/changelog.md index 02e7031..ad16bd1 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,7 @@ +# 1.1.20 + +- Fix: bug where the `_adjust_links` duplicated the anchor. + # 1.1.19 - Add: anchor link parsing for the includes map. diff --git a/setup.py b/setup.py index 7c430a3..045dd20 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ description=SHORT_DESCRIPTION, long_description=LONG_DESCRIPTION, long_description_content_type='text/markdown', - version='1.1.19', + version='1.1.20', author='Konstantin Molchanov', author_email='moigagoo@live.com', url='https://github.com/foliant-docs/foliantcontrib.includes', From 335e9034d148fc0b0a52543d90db89665d90bc56 Mon Sep 17 00:00:00 2001 From: Timur Osmanov Date: Tue, 13 Jan 2026 09:25:53 +0300 Subject: [PATCH 3/8] add: for_includes_map parameter --- foliant/preprocessors/includes.py | 709 +++++++++++++++++------------- test/test_includes.py | 302 +++++++++++++ 2 files changed, 701 insertions(+), 310 deletions(-) diff --git a/foliant/preprocessors/includes.py b/foliant/preprocessors/includes.py index f536e9e..e619bb8 100644 --- a/foliant/preprocessors/includes.py +++ b/foliant/preprocessors/includes.py @@ -1,6 +1,7 @@ import re import urllib.request import urllib.error +import urllib.parse from shutil import rmtree from io import StringIO from hashlib import md5 @@ -54,7 +55,7 @@ def __init__(self, *args, **kwargs): self.includes_map_anchors = False if 'includes_map' in self.options: self.includes_map_enable = True - if type(self.options['includes_map']) != bool and 'anchors' in self.options['includes_map']: + if isinstance(self.options['includes_map'], dict) and 'anchors' in self.options['includes_map']: self.includes_map_anchors = True if self.includes_map_enable: @@ -62,15 +63,15 @@ def __init__(self, *args, **kwargs): self.enable_clean_tokens = True self.chapters = [] - self.chapters_list(self.config["chapters"], self.chapters) # converting chapters to a list + self._chapters_list(self.config["chapters"], self.chapters) # converting chapters to a list self.logger = self.logger.getChild('includes') self.logger.debug(f'Preprocessor inited: {self.__dict__}') - def chapters_list(self, obj, chapters: list) -> list: + def _chapters_list(self, obj, chapters: list) -> None: '''Converting chapters to a list - :param config_chapters: Chapters from config + :param obj: Chapters from config :param chapters: List of chapters ''' if isinstance(obj, list): @@ -78,15 +79,15 @@ def chapters_list(self, obj, chapters: list) -> list: if isinstance(item, str): chapters.append(f"{self.src_dir}/{item}") else: - self.chapters_list(item, chapters) + self._chapters_list(item, chapters) elif isinstance(obj, Path): chapters.append(f"{self.src_dir}/{obj.as_posix()}") - elif isinstance(obj, object): + elif isinstance(obj, dict): for _, v in obj.items(): if isinstance(v, str): chapters.append(f"{self.src_dir}/{v}") else: - self.chapters_list(v, chapters) + self._chapters_list(v, chapters) def _find_file( self, @@ -96,7 +97,7 @@ def _find_file( '''Find a file in a directory by name. Check subdirectories recursively. :param file_name: Name of the file - :lookup_dir: Starting directory + :param lookup_dir: Starting directory :returns: Path to the found file or None if the file was not found :raises: FileNotFoundError @@ -111,21 +112,21 @@ def _find_file( result = item break - else: - raise FileNotFoundError(file_name) + if result is None: + raise FileNotFoundError(f"File not found: {file_name}") self.logger.debug(f'File found: {result}') return result - def create_full_link(self, repo_url: str, revision: str, path: str): + def create_full_link(self, repo_url: str, revision: str, path: str) -> str: + """Create full link to file in repository.""" if repo_url.endswith('.git'): repo_url = repo_url[:-4] if revision: full_repo_url = repo_url + '/tree/' + revision + '/' + path.rpartition('/')[0] - else: full_repo_url = repo_url + '/-/blob/master/' + path.rpartition('/')[0] @@ -153,8 +154,8 @@ def _download_file_from_url(self, url: str) -> Path: extra_suffix = '.inc' downloaded_file_path = ( - self._downloaded_dir_path / - f'{md5(url.encode()).hexdigest()[:8]}_{url_path.stem}{extra_stem}{url_path.suffix}{extra_suffix}' + self._downloaded_dir_path / + f'{md5(url.encode()).hexdigest()[:8]}_{url_path.stem}{extra_stem}{url_path.suffix}{extra_suffix}' ) self.logger.debug(f'Downloaded file path: {downloaded_file_path}') @@ -165,13 +166,18 @@ def _download_file_from_url(self, url: str) -> Path: response = urllib.request.urlopen(url, timeout=2) except (urllib.error.HTTPError, urllib.error.URLError) as error: self.logger.error(f'Data is not retrieved with {error}\nURL: {url}') + raise except socket.timeout: self.logger.error(f'socket timed out - URL {url}') + raise else: charset = 'utf-8' - if response.headers['Content-Type']: - charset_match = re.search(r'(^|[\s;])charset=(?P[^\s;]+)', response.headers['Content-Type']) + if response.headers.get('Content-Type'): + charset_match = re.search( + r'(^|[\s;])charset=(?P[^\s;]+)', + response.headers['Content-Type'] + ) if charset_match: charset = charset_match.group('charset') @@ -182,7 +188,7 @@ def _download_file_from_url(self, url: str) -> Path: self._downloaded_dir_path.mkdir(parents=True, exist_ok=True) - # The beginning of the block codes for converting relative paths to links + # Convert relative paths to absolute links in downloaded content dict_new_link = {} regexp_find_link = re.compile(r'\[.+?\]\(.+?\)') regexp_find_path = re.compile(r'\(.+?\)') @@ -196,13 +202,15 @@ def _download_file_from_url(self, url: str) -> Path: else: relative_path = regexp_find_path.findall(line) sub_relative_path = re.findall(r'\[.+?\]', line) - dict_new_link[line] = sub_relative_path[0] + '(' + url.rpartition('/')[0].replace('raw', - 'blob') + '/' + \ - relative_path[0].partition('(')[2] + if relative_path and sub_relative_path: + dict_new_link[line] = ( + sub_relative_path[0] + '(' + + url.rpartition('/')[0].replace('raw', 'blob') + '/' + + relative_path[0].partition('(')[2] + ) for line in dict_new_link: downloaded_content = downloaded_content.replace(line, dict_new_link[line]) - # End of the conversion code block with open(downloaded_file_path, 'w', encoding='utf8') as downloaded_file: downloaded_file.write(downloaded_content) @@ -230,20 +238,18 @@ def _sync_repo( self.logger.debug(f'Synchronizing with repo; URL: {repo_url}, revision: {revision}') try: - self.logger.debug(f'Cloning repo {repo_url} to {repo_path}') - - run( - f'git clone {repo_url} {repo_path}', - shell=True, - check=True, - stdout=PIPE, - stderr=STDOUT - ) - - except CalledProcessError as exception: - if repo_path.exists(): - self.logger.debug('Repo already cloned; pulling from remote') - + if not repo_path.exists(): + self.logger.debug(f'Cloning repo {repo_url} to {repo_path}') + + run( + f'git clone {repo_url} {repo_path}', + shell=True, + check=True, + stdout=PIPE, + stderr=STDOUT + ) + else: + self.logger.debug('Repo already exists; pulling from remote') try: run( 'git pull', @@ -253,24 +259,25 @@ def _sync_repo( stdout=PIPE, stderr=STDOUT ) - except CalledProcessError as exception: - self.logger.warning(str(exception)) - except Exception as exception: - self.logger.warning(str(exception)) + self.logger.warning(f"Git pull failed: {exception}") - else: - self.logger.error(str(exception)) + except CalledProcessError as exception: + self.logger.error(f"Git operation failed: {exception}") + raise if revision: - run( - f'git checkout {revision}', - cwd=repo_path, - shell=True, - check=True, - stdout=PIPE, - stderr=STDOUT - ) + try: + run( + f'git checkout {revision}', + cwd=repo_path, + shell=True, + check=True, + stdout=PIPE, + stderr=STDOUT + ) + except CalledProcessError as exception: + self.logger.warning(f"Failed to checkout revision {revision}: {exception}") return repo_path @@ -295,12 +302,10 @@ def _sub(heading): f'Shift heading level to {new_heading_level}, heading content: {heading.group("content")}' ) - if new_heading_level <= 6: + if new_heading_level <= 6 and new_heading_level >= 1: return f'{"#" * new_heading_level} {heading.group("content")}{heading.group("tail")}' - else: self.logger.debug('New heading level is out of range, using bold paragraph text instead of heading') - return f'**{heading.group("content")}**{heading.group("tail")}' return self._heading_pattern.sub(_sub, content) @@ -325,7 +330,7 @@ def _find_top_heading_level( if heading_level < result: result = heading_level - self.logger.debug(f'Maximum heading level: {result}') + self.logger.debug(f'Maximum heading level: {result}') return result if result < float('inf') else 0 @@ -382,6 +387,8 @@ def _cut_from_position_to_position( ) # First, cut the content from the starting position to the end + from_heading_line = None + from_heading_level = None if from_id: self.logger.debug('Starting point is defined by its ID') @@ -392,40 +399,44 @@ def _cut_from_position_to_position( ) from_anchor_pattern = re.compile( - rf'(?:(?]*))?\>{re.escape(from_id)}<\/anchor\>' + rf'(?:(?]*))?\>{re.escape(from_id)}<\/anchor\>', + flags=re.MULTILINE ) - if from_identified_heading_pattern.findall(content): + if from_identified_heading_pattern.search(content): self.logger.debug('Starting heading with defined ID is found') - - result = from_identified_heading_pattern.split(content)[1] - - from_heading_line = from_identified_heading_pattern.findall(content)[0] - from_heading_level = len(self._heading_pattern.match(from_heading_line).group('hashes')) - - self.logger.debug(f'Level of starting heading: {from_heading_level}') - - elif from_anchor_pattern.findall(content): + parts = from_identified_heading_pattern.split(content, maxsplit=1) + if len(parts) > 1: + result = parts[1] + from_heading_line = from_identified_heading_pattern.search(content).group(0) + from_heading_level = len(self._heading_pattern.match(from_heading_line).group('hashes')) + else: + result = '' + elif from_anchor_pattern.search(content): self.logger.debug('Starting anchor with defined ID is found') - - result = from_anchor_pattern.split(content)[1] - - previous_content = from_anchor_pattern.split(content)[0] - - from_heading_line = None - from_heading_level = None - - for previous_heading_match in self._heading_pattern.finditer(previous_content): - from_heading_level = len(previous_heading_match.group('hashes')) - - self.logger.debug(f'Level of starting heading: {from_heading_level}') - + parts = from_anchor_pattern.split(content, maxsplit=1) + if len(parts) > 1: + result = parts[1] + previous_content = parts[0] + + # Find the last heading before the anchor + last_heading_match = None + for heading_match in self._heading_pattern.finditer(previous_content): + last_heading_match = heading_match + + if last_heading_match: + from_heading_level = len(last_heading_match.group('hashes')) + self.logger.debug(f'Level of previous heading: {from_heading_level}') + else: + from_heading_level = self._find_top_heading_level(result) + self.logger.debug(f'No previous heading found, top level from result: {from_heading_level}') + else: + result = '' else: self.logger.debug( 'Neither starting heading nor starting anchor is found, ' 'skipping the included content' ) - return '' elif from_heading: @@ -436,46 +447,38 @@ def _cut_from_position_to_position( flags=re.MULTILINE ) - if from_heading_pattern.findall(content): + if from_heading_pattern.search(content): self.logger.debug('Starting heading with defined content is found') - - result = from_heading_pattern.split(content)[1] - - from_heading_line = from_heading_pattern.findall(content)[0] - from_heading_level = len(self._heading_pattern.match(from_heading_line).group('hashes')) - - self.logger.debug(f'Level of starting heading: {from_heading_level}') - + parts = from_heading_pattern.split(content, maxsplit=1) + if len(parts) > 1: + result = parts[1] + from_heading_line = from_heading_pattern.search(content).group(0) + from_heading_level = len(self._heading_pattern.match(from_heading_line).group('hashes')) + else: + result = '' else: self.logger.debug('Starting heading is not found, skipping the included content') - return '' else: self.logger.debug('Starting point is not defined') content_buffer = StringIO(content) - first_line = content_buffer.readline() if self._heading_pattern.fullmatch(first_line): self.logger.debug('The content starts with heading') - result = content_buffer.read() from_heading_line = first_line from_heading_level = len(self._heading_pattern.match(from_heading_line).group('hashes')) - else: self.logger.debug('The content does not start with heading') - result = content - from_heading_line = None from_heading_level = self._find_top_heading_level(content) self.logger.debug(f'Topmost heading level: {from_heading_level}') # After that, cut the result to the ending position - if to_end: self.logger.debug('Ending point is defined as the end of the document') @@ -488,19 +491,18 @@ def _cut_from_position_to_position( ) to_anchor_pattern = re.compile( - rf'(?:(?]*))?\>{re.escape(to_id)}<\/anchor\>' + rf'(?:(?]*))?\>{re.escape(to_id)}<\/anchor\>', + flags=re.MULTILINE ) - if to_identified_heading_pattern.findall(result): + if to_identified_heading_pattern.search(result): self.logger.debug('Ending heading with defined ID is found') - - result = to_identified_heading_pattern.split(result)[0] - - elif to_anchor_pattern.findall(result): + parts = to_identified_heading_pattern.split(result, maxsplit=1) + result = parts[0] if parts else '' + elif to_anchor_pattern.search(result): self.logger.debug('Ending anchor with defined ID is found') - - result = to_anchor_pattern.split(result)[0] - + parts = to_anchor_pattern.split(result, maxsplit=1) + result = parts[0] if parts else '' else: self.logger.debug('Neither ending heading nor ending anchor is found, cutting to the end') @@ -512,11 +514,10 @@ def _cut_from_position_to_position( flags=re.MULTILINE ) - if to_heading_pattern.findall(result): + if to_heading_pattern.search(result): self.logger.debug('Ending heading with defined content is found') - - result = to_heading_pattern.split(result)[0] - + parts = to_heading_pattern.split(result, maxsplit=1) + result = parts[0] if parts else '' else: self.logger.debug('Ending heading is not found, cutting to the end') @@ -528,38 +529,32 @@ def _cut_from_position_to_position( 'Since starting point is defined, cutting to the next heading of the same level' ) - to_heading_pattern = re.compile( - rf'^\#{{1,{from_heading_level}}}\s+\S+.*$', - flags=re.MULTILINE - ) - - result = to_heading_pattern.split(result)[0] - + if from_heading_level: + to_heading_pattern = re.compile( + rf'^\#{{1,{from_heading_level}}}\s+\S+.*$', + flags=re.MULTILINE + ) + parts = to_heading_pattern.split(result, maxsplit=1) + result = parts[0] if parts else '' else: self.logger.debug( 'Since starting point is not defined, using the whole included content' ) # Finally, take into account the options nohead and sethead - if not nohead and from_heading_line: self.logger.debug( 'Since nohead option is not specified, and the included content starts with heading, ' + 'including starting heading into the output' ) - result = from_heading_line + result - if sethead: + if sethead and from_heading_level: if sethead > 0: self.logger.debug( 'Since sethead option is specified, shifting headings levels in the included content' ) - - result = self._shift_headings( - result, - sethead - from_heading_level - ) + result = self._shift_headings(result, sethead - from_heading_level) return result @@ -601,63 +596,76 @@ def _adjust_links( :param content: Markdown content :param markdown_file_path: Path to the Markdown file containing the content + :param origin_file_path: Path to the original file where the include tag is located :returns: Markdown content with relative internal link paths ''' - def _resolve_link(link, root_path, depth_origin): + + def _resolve_link(link: str, root_path: Path, depth_origin: int) -> str: try: resolved_link = (markdown_file_path.absolute().parent / Path(link)).resolve() resolved_link = resolved_link.relative_to(root_path) resolved_link = '../' * depth_origin + resolved_link.as_posix() return resolved_link except Exception as exception: - self.logger.debug( - f'An error {exception} occurred when resolving the link: {link}' - ) + self.logger.debug(f'An error {exception} occurred when resolving the link: {link}') + return link def _sub(m): caption = m.group('text') link = m.group('path') anchor = '' + + # Split link and anchor link_array = m.group('path').split('#') if len(link_array) > 1: link = link_array[0] anchor = f'#{link_array[1]}' + root_path = self.project_path.absolute() / self.tmp_dir - if not Path(link).is_absolute(): - extension = Path(link).suffix - try: - origin_rel = origin_file_path.relative_to(root_path) - depth_origin = len(origin_rel.parts) - depth_markdown_file = len(markdown_file_path.relative_to(root_path).parts) - depth_difference = depth_origin - depth_markdown_file - if extension == ".md": - link = _resolve_link(link, root_path, depth_origin - 1) - elif extension == "": - if depth_origin >= depth_markdown_file: - link = '../' * depth_difference + link - else: - link_split = link.split('/') - if link_split[0] == '..': - if link_split[-1] == '': - link_split = link_split[:-1] - link_split = link_split[1:] - link = f"{'/'.join(link_split)}.md" - link = _resolve_link(link, root_path, depth_origin) - if ( - depth_difference == 0 - ) and ( - Path(Path(link).name).with_suffix('').as_posix() == Path(origin_rel.name).with_suffix('').as_posix() - ): - link = '' - self.logger.debug( - f'Updating link reference; user specified path: {m.group("path")}, ' + - f'absolute path: {link}' - ) - except Exception as exception: - self.logger.debug( - f'An error {exception} occurred when resolving the link: {m.group("path")}' - ) + + # Skip absolute paths and external URLs + if Path(link).is_absolute() or link.startswith(('http://', 'https://', 'ftp://')): + return f'[{caption}]({link}{anchor})' + + extension = Path(link).suffix + + try: + origin_rel = origin_file_path.relative_to(root_path) + depth_origin = len(origin_rel.parts) + depth_markdown_file = len(markdown_file_path.relative_to(root_path).parts) + depth_difference = depth_origin - depth_markdown_file + + if extension == ".md": + link = _resolve_link(link, root_path, depth_origin - 1) + elif extension == "": + if depth_origin >= depth_markdown_file: + link = '../' * depth_difference + link + else: + link_split = link.split('/') + if link_split and link_split[0] == '..': + if link_split[-1] == '': + link_split = link_split[:-1] + link_split = link_split[1:] + link = f"{'/'.join(link_split)}.md" + link = _resolve_link(link, root_path, depth_origin) + + # Check if link points to the same file (without anchor) + if (depth_difference == 0 and + Path(Path(link).name).with_suffix('').as_posix() == + Path(origin_rel.name).with_suffix('').as_posix()): + link = '' + + self.logger.debug( + f'Updating link reference; user specified path: {m.group("path")}, ' + + f'resolved path: {link}' + ) + + except Exception as exception: + self.logger.debug( + f'An error {exception} occurred when resolving the link: {m.group("path")}' + ) + link = m.group('path') return f'[{caption}]({link}{anchor})' @@ -741,9 +749,9 @@ def _get_src_file_path( ) path_mapped_to_src_dir = ( - self.project_path.resolve() / - self.config['src_dir'] / - path_relative_to_working_dir + self.project_path.resolve() / + self.config['src_dir'] / + path_relative_to_working_dir ) self.logger.debug( @@ -771,14 +779,14 @@ def _get_included_file_path( self.logger.debug(f'Currently processed Markdown file: {current_processed_file_path}') - included_file_path = (current_processed_file_path.parent / user_specified_path).resolve() + included_file_path = (current_processed_file_path.parent / Path(user_specified_path)).resolve() self.logger.debug(f'User-specified included file path: {included_file_path}') if ( - self.working_dir.resolve() in current_processed_file_path.parents - and - self.working_dir.resolve() not in included_file_path.parents + self.working_dir.resolve() in current_processed_file_path.parents + and + self.working_dir.resolve() not in included_file_path.parents ): self.logger.debug( 'Currently processed file is located inside the working dir, ' + @@ -788,18 +796,66 @@ def _get_included_file_path( ) included_file_path = ( - self._get_src_file_path(current_processed_file_path).parent / user_specified_path + self._get_src_file_path(current_processed_file_path).parent / Path(user_specified_path) ).resolve() else: - self.logger.debug( - 'Using these paths without changes' - ) + self.logger.debug('Using these paths without changes') self.logger.debug(f'Finally, included file path: {included_file_path}') return included_file_path + def _read_source_file_content( + self, + file_path: Path + ) -> str: + '''Read content from source file, handling both temporary and source directory paths. + + :param file_path: Path to the file to read + + :returns: File content as string + ''' + + self.logger.debug(f'Reading source file: {file_path}') + + # If the file is located in a temporary directory, let's try to find the corresponding source file + if self.working_dir.resolve() in file_path.parents: + # This is a file in a temporary directory + try: + # Get the path to the source file + src_file_path = self._get_src_file_path(file_path) + self.logger.debug(f'Mapping temporary file to source file: {src_file_path}') + + if src_file_path.exists(): + with open(src_file_path, encoding='utf8') as src_file: + return src_file.read() + else: + # If the source file is not found, we read from the temporary file + self.logger.debug('Source file not found, reading from temporary file') + if file_path.exists(): + with open(file_path, encoding='utf8') as temp_file: + return temp_file.read() + else: + self.logger.warning(f'File not found: {file_path}') + return '' + except Exception as e: + self.logger.debug(f'Error mapping to source file: {e}, reading from temporary file') + if file_path.exists(): + with open(file_path, encoding='utf8') as temp_file: + return temp_file.read() + else: + self.logger.warning(f'File not found: {file_path}') + return '' + else: + # The file is not in the temporary directory, we read it directly + if file_path.exists(): + with open(file_path, encoding='utf8') as src_file: + return src_file.read() + else: + self.logger.warning(f'File not found: {file_path}') + return '' + def _process_include( self, included_file_path: Path, @@ -812,14 +868,15 @@ def _process_include( sethead: int or None = None, nohead: bool = False, include_link: str or None = None, - origin_file_path: Path = None + origin_file_path: Path = None, + for_includes_map: bool = False ) -> (str, list): '''Replace a local include statement with the file content. Necessary adjustments are applied to the content: cut between certain headings, strip the top heading, set heading level. :param included_file_path: Path to the included file - :param project_root_path: Path to the “root” directory of Foliant project + :param project_root_path: Path to the "root" directory of Foliant project that the currently processed Markdown file belongs to :param from_heading: Include starting from this heading :param to_heading: Include up to this heading (not including the heading itself) @@ -829,81 +886,93 @@ def _process_include( :param to_end: Flag that tells to cut to the end of document :param sethead: Level of the topmost heading in the included content :param nohead: Flag that tells to strip the starting heading from the included content + :param include_link: Link to the included file for URL includes + :param origin_file_path: Path to the original file where include tag is located + :param for_includes_map: Flag indicating this is for includes_map generation only - :returns: Included file content + :returns: Tuple of (included file content, list of anchors) ''' self.logger.debug( f'Included file path: {included_file_path}, from heading: {from_heading}, ' + - f'to heading: {to_heading}, sethead: {sethead}, nohead: {nohead}' + f'to heading: {to_heading}, sethead: {sethead}, nohead: {nohead}, ' + + f'for_includes_map: {for_includes_map}' ) anchors = [] - if not included_file_path.exists(): - if self.options['allow_failure']: - self.logger.error(f'The url or repo_url link is not correct, file not found: {included_file_path}') - - path_error_link = Path(self.project_path/'.error_link').resolve() - - if not Path(path_error_link).exists(): - Path(path_error_link).mkdir() + # To generate includes_map, we read files directly from the source directory + if for_includes_map: + included_content = self._read_source_file_content(included_file_path) + else: + if not included_file_path.exists(): + if self.options['allow_failure']: + self.logger.error(f'The url or repo_url link is not correct, file not found: {included_file_path}') - path_error_file = open(path_error_link/included_file_path.name, 'w+', encoding='utf8') + path_error_link = Path(self.project_path / '.error_link').resolve() - if self.options['stub_text']: - path_error_file.write(f'The url or repo_url link is not correct, file not found: {included_file_path}') - path_error_file.close() + if not path_error_link.exists(): + path_error_link.mkdir(parents=True) - included_file_path = path_error_link/included_file_path.name - else: - self.logger.error(f'The url or repo_url link is not correct, file not found: {included_file_path}') - return '', anchors + path_error_file = path_error_link / included_file_path.name + with open(path_error_file, 'w+', encoding='utf8') as f: + if self.options['stub_text']: + f.write(f'The url or repo_url link is not correct, file not found: {included_file_path}') - with open(included_file_path, encoding='utf8') as included_file: - included_content = included_file.read() + included_file_path = path_error_file + else: + self.logger.error(f'The url or repo_url link is not correct, file not found: {included_file_path}') + return '', anchors - # The beginning of the block codes for converting relative paths to links - if include_link: - dict_new_link = {} - regexp_find_link = re.compile(r'\[.+?\]\(.+?\)') - regexp_find_path = re.compile(r'\(.+?\)') + with open(included_file_path, encoding='utf8') as included_file: + included_content = included_file.read() - old_found_link = regexp_find_link.findall(included_content) + # Convert relative paths to absolute links for URL includes + if include_link and not for_includes_map: + dict_new_link = {} + regexp_find_link = re.compile(r'\[.+?\]\(.+?\)') + regexp_find_path = re.compile(r'\(.+?\)') - for line in old_found_link: - relative_path = regexp_find_path.findall(line) + old_found_link = regexp_find_link.findall(included_content) - for ex_line in relative_path: - exceptions_characters = re.findall(r'https?://[^\s]+|@|:|\.png|\.jpeg|.svg', ex_line) - if exceptions_characters: - continue - else: - sub_relative_path = re.findall(r'\[.+?\]', line) - dict_new_link[line] = sub_relative_path[0] + '(' + include_link.rpartition('/')[0].replace( - 'raw', 'blob') + '/' + relative_path[0].partition('(')[2] + for line in old_found_link: + relative_path = regexp_find_path.findall(line) - for line in dict_new_link: - included_content = included_content.replace(line, dict_new_link[line]) - # End of the conversion code block + for ex_line in relative_path: + exceptions_characters = re.findall(r'https?://[^\s]+|@|:|\.png|\.jpeg|\.svg', ex_line) + if exceptions_characters: + continue + else: + sub_relative_path = re.findall(r'\[.+?\]', line) + if sub_relative_path and relative_path: + dict_new_link[line] = ( + sub_relative_path[0] + '(' + + include_link.rpartition('/')[0].replace('raw', 'blob') + '/' + + relative_path[0].partition('(')[2] + ) - # Removing metadata from content before including - included_content = remove_meta(included_content) - included_content = self._cut_from_position_to_position( - included_content, - from_heading, - to_heading, - from_id, - to_id, - to_end, - sethead, - nohead - ) + for line in dict_new_link: + included_content = included_content.replace(line, dict_new_link[line]) + + # Removing metadata from content before including + included_content = remove_meta(included_content) + included_content = self._cut_from_position_to_position( + included_content, + from_heading, + to_heading, + from_id, + to_id, + to_end, + sethead, + nohead + ) - # Find anchors - if self.includes_map_anchors: - anchors = self._add_anchors(anchors, included_content) + # Find anchors + if self.includes_map_anchors: + anchors = self._add_anchors(anchors, included_content) + # We do not apply additional processing for includes_map + if not for_includes_map: if self.config.get('escape_code', False): if isinstance(self.config['escape_code'], dict): escapecode_options = self.config['escape_code'].get('options', {}) @@ -924,7 +993,8 @@ def _process_include( ).escape(included_content) included_content = self._adjust_image_paths(included_content, included_file_path) - included_content = self._adjust_links(included_content, included_file_path, origin_file_path) + if origin_file_path: + included_content = self._adjust_links(included_content, included_file_path, origin_file_path) if project_root_path: included_content = self._adjust_paths_in_tags_attributes( @@ -957,30 +1027,31 @@ def _find_anchors(self, content: str) -> list: anchors_list = [] anchors = re.findall(r'\([\-\_A-Za-z0-9]+)\<\/anchor\>', content) - for anchor in anchors: - anchors_list.append(anchor) + anchors_list.extend(anchors) + custom_ids = re.findall(r'\{\#([\-\_A-Za-z0-9]+)\}', content) - for anchor in custom_ids: - anchors_list.append(anchor) + anchors_list.extend(custom_ids) + elements_with_ids = re.findall(r'id\=[\"\']([\-\_A-Za-z0-9]+)[\"\']', content) - for anchor in elements_with_ids: - anchors_list.append(anchor) + anchors_list.extend(elements_with_ids) + return anchors_list - def _add_anchors(self, l: list, content: str) -> list: + def _add_anchors(self, anchor_list: list, content: str) -> list: """Add an anchor link to the list of anchor links - :param l: The original list + :param anchor_list: The original list :param content: Markdown content :returns: A list with added anchors """ anchors = self._find_anchors(content) if anchors: - l.extend(anchors) - return l + anchor_list.extend(anchors) + return anchor_list def clean_tokens(self, url: str) -> str: + """Remove authentication tokens from URLs.""" token_pattern = r"(https*://)(.*)@(.*)" s = url if self.enable_clean_tokens: @@ -989,6 +1060,7 @@ def clean_tokens(self, url: str) -> str: return s def _prepare_path_for_includes_map(self, path: Path) -> str: + """Prepare path for includes map.""" donor_path = None if path.as_posix().startswith(self.working_dir.as_posix()): _path = path.relative_to(self.working_dir) @@ -997,15 +1069,13 @@ def _prepare_path_for_includes_map(self, path: Path) -> str: _path = path.relative_to(getcwd()) if _path.as_posix().startswith(self.working_dir.as_posix()): _path = _path.relative_to(self.working_dir) - if _path.as_posix().startswith(self.working_dir.as_posix()): - donor_path = f"{self.src_dir}/{_path.relative_to(self.working_dir).as_posix()}" - else: - donor_path = f"{self.src_dir}/{_path.as_posix()}" + donor_path = f"{self.src_dir}/{_path.as_posix()}" else: donor_path = _path.as_posix() return donor_path def _exist_in_includes_map(self, includes_map: list, path: str) -> bool: + """Check if path exists in includes map.""" for obj in includes_map: if obj["file"] == path: return True @@ -1016,7 +1086,8 @@ def process_includes( markdown_file_path: Path, content: str, project_root_path: Path or None = None, - sethead: int or None = None + sethead: int or None = None, + for_includes_map: bool = False ) -> str: '''Replace all include statements with the respective file contents. @@ -1026,6 +1097,7 @@ def process_includes( that the currently processed Markdown file belongs to :param sethead: Level of the topmost heading in the content, it may be set when the method is called recursively + :param for_includes_map: Flag indicating this is for includes_map generation only :returns: Markdown content with resolved includes ''' @@ -1038,7 +1110,7 @@ def process_includes( markdown_file_path = markdown_file_path.resolve() - self.logger.debug(f'Processing Markdown file: {markdown_file_path}') + self.logger.debug(f'Processing Markdown file: {markdown_file_path}, for_includes_map: {for_includes_map}') processed_content = '' @@ -1053,9 +1125,8 @@ def process_includes( include_statement = self.pattern.fullmatch(content_part) if include_statement: - if self.includes_map_enable: - donor_md_path = None - donor_anchors = [] + donor_md_path = None + donor_anchors = [] current_project_root_path = project_root_path @@ -1077,7 +1148,6 @@ def process_includes( if options.get('sethead'): if current_sethead: current_sethead += options['sethead'] - 1 - else: current_sethead = options['sethead'] @@ -1105,7 +1175,7 @@ def process_includes( > """ - if body: + if body and body.group('path'): self.logger.debug('Using the legacy syntax rules') if body.group('repo'): @@ -1120,7 +1190,6 @@ def process_includes( if '#' in repo_from_alias: repo_url, revision = repo_from_alias.split('#', maxsplit=1) - else: repo_url = repo_from_alias @@ -1129,16 +1198,20 @@ def process_includes( if body.group('revision'): revision = body.group('revision') - self.logger.debug( f'Highest priority revision specified in the include statement: {revision}' ) self.logger.debug(f'Repo URL: {repo_url}, revision: {revision}') - repo_path = self._sync_repo(repo_url, revision) - - self.logger.debug(f'Local path of the repo: {repo_path}') + # We are not synchronizing the repository for includes_map + if not for_includes_map: + repo_path = self._sync_repo(repo_url, revision) + self.logger.debug(f'Local path of the repo: {repo_path}') + else: + # Creating a dummy path for includes_map + repo_path = Path('/dummy/repo/path') + self.logger.debug('Skipping repo sync for includes_map generation') included_file_path = repo_path / body.group('path') @@ -1146,15 +1219,18 @@ def process_includes( donor_md_path = included_file_path.as_posix() donor_md_path = self.clean_tokens(donor_md_path) self.logger.debug(f'Set the repo URL of the included file to {recipient_md_path}: {donor_md_path} (1)') + if included_file_path.name.startswith('^'): - included_file_path = self._find_file( - included_file_path.name[1:], included_file_path.parent - ) + # For includes_map, we don't search for files, just use the path + if not for_includes_map: + included_file_path = self._find_file( + included_file_path.name[1:], included_file_path.parent + ) self.logger.debug(f'Resolved path to the included file: {included_file_path}') current_project_root_path = ( - repo_path / options.get('project_root', '') + repo_path / options.get('project_root', '') ).resolve() self.logger.debug(f'Set new current project root path: {current_project_root_path}') @@ -1166,7 +1242,8 @@ def process_includes( to_heading=body.group('to_heading'), sethead=current_sethead, nohead=options.get('nohead'), - origin_file_path=markdown_file_path + origin_file_path=markdown_file_path, + for_includes_map=for_includes_map ) if self.includes_map_enable and self.includes_map_anchors: @@ -1178,15 +1255,17 @@ def process_includes( included_file_path = self._get_included_file_path(body.group('path'), markdown_file_path) if included_file_path.name.startswith('^'): - included_file_path = self._find_file( - included_file_path.name[1:], included_file_path.parent - ) + # For includes_map, we don't search for files, just use the path + if not for_includes_map: + included_file_path = self._find_file( + included_file_path.name[1:], included_file_path.parent + ) self.logger.debug(f'Resolved path to the included file: {included_file_path}') if options.get('project_root'): current_project_root_path = ( - markdown_file_path.parent / options.get('project_root') + markdown_file_path.parent / options.get('project_root') ).resolve() self.logger.debug(f'Set new current project root path: {current_project_root_path}') @@ -1198,7 +1277,8 @@ def process_includes( to_heading=body.group('to_heading'), sethead=current_sethead, nohead=options.get('nohead'), - origin_file_path=markdown_file_path + origin_file_path=markdown_file_path, + for_includes_map=for_includes_map ) if self.includes_map_enable: @@ -1209,26 +1289,33 @@ def process_includes( if self.includes_map_enable and self.includes_map_anchors: donor_anchors = donor_anchors + anchors - else: # if body is missing + else: # if body is missing or empty self.logger.debug('Using the new syntax rules') if options.get('repo_url') and options.get('path'): self.logger.debug('File in Git repository referenced') - repo_path = self._sync_repo(options.get('repo_url'), options.get('revision')) - - self.logger.debug(f'Local path of the repo: {repo_path}') + # We are not synchronizing the repository for includes_map + if not for_includes_map: + repo_path = self._sync_repo(options.get('repo_url'), options.get('revision')) + self.logger.debug(f'Local path of the repo: {repo_path}') + else: + # Creating a dummy path for includes_map + repo_path = Path('/dummy/repo/path') + self.logger.debug('Skipping repo sync for includes_map generation') included_file_path = repo_path / options['path'] - self.logger.debug(f'Resolved path to the included file: {included_file_path}') current_project_root_path = ( - repo_path / options.get('project_root', '') + repo_path / options.get('project_root', '') ).resolve() - include_link = self.create_full_link(options.get('repo_url'), options.get('revision'), - options.get('path')) + include_link = self.create_full_link( + options.get('repo_url'), + options.get('revision'), + options.get('path') + ) self.logger.debug(f'Set new current project root path: {current_project_root_path}') @@ -1243,7 +1330,8 @@ def process_includes( sethead=current_sethead, nohead=options.get('nohead'), include_link=include_link, - origin_file_path=markdown_file_path + origin_file_path=markdown_file_path, + for_includes_map=for_includes_map ) if self.includes_map_enable: @@ -1257,13 +1345,18 @@ def process_includes( elif options.get('url'): self.logger.debug('File to get by URL referenced') - included_file_path = self._download_file_from_url(options['url']) - - self.logger.debug(f'Resolved path to the included file: {included_file_path}') + # We don't download files for includes_map + if not for_includes_map: + included_file_path = self._download_file_from_url(options['url']) + self.logger.debug(f'Resolved path to the included file: {included_file_path}') + else: + # Creating a dummy path for includes_map + included_file_path = Path('/dummy/url/file') + self.logger.debug('Skipping URL download for includes_map generation') if options.get('project_root'): current_project_root_path = ( - markdown_file_path.parent / options.get('project_root') + markdown_file_path.parent / options.get('project_root') ).resolve() self.logger.debug(f'Set new current project root path: {current_project_root_path}') @@ -1278,7 +1371,8 @@ def process_includes( to_end=options.get('to_end'), sethead=current_sethead, nohead=options.get('nohead'), - origin_file_path=markdown_file_path + origin_file_path=markdown_file_path, + for_includes_map=for_includes_map ) if self.includes_map_enable: @@ -1301,7 +1395,7 @@ def process_includes( if options.get('project_root'): current_project_root_path = ( - markdown_file_path.parent / options.get('project_root') + markdown_file_path.parent / options.get('project_root') ).resolve() self.logger.debug(f'Set new current project root path: {current_project_root_path}') @@ -1316,7 +1410,8 @@ def process_includes( to_end=options.get('to_end'), sethead=current_sethead, nohead=options.get('nohead'), - origin_file_path=markdown_file_path + origin_file_path=markdown_file_path, + for_includes_map=for_includes_map ) if self.includes_map_enable: @@ -1330,17 +1425,17 @@ def process_includes( self.logger.warning( 'Neither repo_url+path nor src specified, ignoring the include statement' ) - processed_content_part = '' - if self.options['recursive'] and self.pattern.search(processed_content_part): + if self.options['recursive'] and self.pattern.search(processed_content_part) and not for_includes_map: self.logger.debug('Recursive call of include statements processing') processed_content_part = self.process_includes( included_file_path, processed_content_part, current_project_root_path, - current_sethead + current_sethead, + for_includes_map ) wrap_code = options.get('wrap_code', '') @@ -1349,19 +1444,15 @@ def process_includes( wrapper = '' if wrap_code == 'triple_backticks': self.logger.debug('Wrapping included content as fence code block with triple backticks') - wrapper = '```' - elif wrap_code == 'triple_tildas': self.logger.debug('Wrapping included content as fence code block with triple tildas') - wrapper = '~~~' code_language = options.get('code_language', '') if code_language: self.logger.debug(f'Specifying code language: {code_language}') - else: self.logger.debug('Do not specify code language') @@ -1369,40 +1460,38 @@ def process_includes( processed_content_part += '\n' processed_content_part = ( - f'{wrapper}{code_language}' + '\n' + processed_content_part + wrapper + '\n' + f'{wrapper}{code_language}\n{processed_content_part}{wrapper}\n' ) elif wrap_code == 'single_backticks': self.logger.debug('Wrapping included content as inline code with single backticks') - processed_content_part = '`' + processed_content_part + '`' - if options.get('inline'): + if options.get('inline') and not for_includes_map: self.logger.debug( 'Processing included content part as inline, multiple lines will be stretched into one' ) - processed_content_part = re.sub(r'\s+', ' ', processed_content_part).strip() - if self.includes_map_enable: - if donor_md_path: - if recipient_md_path in self.chapters or "index.md" in recipient_md_path: - if not self._exist_in_includes_map(self.includes_map, recipient_md_path): - if not self.includes_map_anchors or len(donor_anchors) == 0: - self.includes_map.append({'file': recipient_md_path, "includes": []}) - else: - self.includes_map.append({'file': recipient_md_path, "includes": [], 'anchors': []}) - - for i, f in enumerate(self.includes_map): - if f['file'] == recipient_md_path: + if self.includes_map_enable and donor_md_path: + if recipient_md_path in self.chapters or "index.md" in recipient_md_path: + if not self._exist_in_includes_map(self.includes_map, recipient_md_path): + if not self.includes_map_anchors or len(donor_anchors) == 0: + self.includes_map.append({'file': recipient_md_path, "includes": []}) + else: + self.includes_map.append({'file': recipient_md_path, "includes": [], 'anchors': []}) + + for i, f in enumerate(self.includes_map): + if f['file'] == recipient_md_path: + if donor_md_path not in self.includes_map[i]['includes']: self.includes_map[i]['includes'].append(donor_md_path) - if self.includes_map_anchors: - if 'anchors' not in self.includes_map[i]: - self.includes_map[i]['anchors'] = [] - for anchor in donor_anchors: - if anchor not in self.includes_map[i]['anchors']: - self.includes_map[i]['anchors'].append(anchor) + if self.includes_map_anchors: + if 'anchors' not in self.includes_map[i]: + self.includes_map[i]['anchors'] = [] + for anchor in donor_anchors: + if anchor not in self.includes_map[i]['anchors']: + self.includes_map[i]['anchors'].append(anchor) else: processed_content_part = content_part @@ -1467,8 +1556,8 @@ def apply(self): if self.includes_map_enable: output = f'{self.working_dir}/static/includes_map.json' Path(f'{self.working_dir}/static/').mkdir(parents=True, exist_ok=True) - with open(f'{self.working_dir}/static/includes_map.json', 'w', encoding='utf8') as f: + with open(output, 'w', encoding='utf8') as f: dump(self.includes_map, f) - self.logger.debug(f'includes_map write to {output}') + self.logger.debug(f'includes_map written to {output}') self.logger.info('Preprocessor applied') diff --git a/test/test_includes.py b/test/test_includes.py index c52d8c7..676f058 100644 --- a/test/test_includes.py +++ b/test/test_includes.py @@ -350,3 +350,305 @@ def test_adjust_links_three(self): input_mapping=input_map, expected_mapping=expected_map, ) + + def test_includes_map_with_not_build_file(self): + '''Test includes_map generation for files with not_build: true parameter.''' + self.ptf.options = {'includes_map': True } + input_map = { + 'index.md': '# My title\n\n', + 'not_build.md': '# Not built file\n\n', + 'sub/sub-1.md': 'Included content 1', + 'sub/sub-2.md': 'Included content 2' + } + expected_map = { + 'index.md': '# My title\n\nIncluded content 1', + 'static/includes_map.json': "[{\"file\": \"__src__/index.md\", \"includes\": [\"__src__/sub/sub-1.md\"]}, {\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/sub/sub-2.md\"]}]", + 'not_build.md': '# Not built file\n\n', + 'sub/sub-1.md': 'Included content 1', + 'sub/sub-2.md': 'Included content 2' + } + + # We pretend that not_build.md has not_build: true in the metadata + # To do this, create a file with a frontmatter + not_build_content = """--- +not_build: true +--- + +# Not built file + +""" + + input_map['not_build.md'] = not_build_content + + self.ptf.test_preprocessor( + input_mapping=input_map, + expected_mapping=expected_map, + ) + + def test_includes_map_with_anchors_and_not_build(self): + '''Test includes_map generation with anchors for files with not_build: true.''' + self.ptf.options = {'includes_map': {'anchors': True} } + input_map = { + 'index.md': '# My title\n\n', + 'not_build.md': """--- +not_build: true +--- + +# Not built file + +""", + 'sub/sub-1.md': '# Included 1 {#anchor1}\n\nContent 1\n\nanchor2', + 'sub/sub-2.md': '# Included 2 {#anchor3}\n\nContent 2\n\nanchor4' + } + expected_map = { + 'index.md': '# My title\n\n# Included 1 {#anchor1}\n\nContent 1\n\nanchor2', + 'static/includes_map.json': ( + "[" + "{\"file\": \"__src__/index.md\", \"includes\": [\"__src__/sub/sub-1.md\"], \"anchors\": [\"anchor1\", \"anchor2\"]}, " + "{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/sub/sub-2.md\"], \"anchors\": [\"anchor3\", \"anchor4\"]}" + "]" + ), + 'not_build.md': """--- +not_build: true +--- + +# Not built file + +""", + 'sub/sub-1.md': '# Included 1 {#anchor1}\n\nContent 1\n\nanchor2', + 'sub/sub-2.md': '# Included 2 {#anchor3}\n\nContent 2\n\nanchor4' + } + + self.ptf.test_preprocessor( + input_mapping=input_map, + expected_mapping=expected_map, + ) + + def test_recursive_includes_in_not_build_file(self): + '''Test recursive includes in files with not_build: true.''' + self.ptf.options = {'includes_map': True, 'recursive': True } + input_map = { + 'index.md': '# Main file\n\n', + 'not_build.md': """--- +not_build: true +--- + +# Not built file + +""", + 'level1.md': '# Level 1\n\n', + 'level2.md': '# Level 2\n\nFinal content' + } + expected_map = { + 'index.md': '# Main file\n\n# Not built file\n\n# Level 1\n\n# Level 2\n\nFinal content', + 'static/includes_map.json': ( + "[" + "{\"file\": \"__src__/index.md\", \"includes\": [\"__src__/not_build.md\"]}, " + "{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/level1.md\"]}" + "]" + ), + 'not_build.md': """--- +not_build: true +--- + +# Not built file + +""", + 'level1.md': '# Level 1\n\n# Level 2\n\nFinal content', + 'level2.md': '# Level 2\n\nFinal content' + } + + self.ptf.test_preprocessor( + input_mapping=input_map, + expected_mapping=expected_map, + ) + + def test_includes_map_with_repo_and_not_build(self): + '''Test includes_map generation for repository includes in not_build files.''' + self.ptf.options = {'includes_map': True } + input_map = { + 'index.md': '# Main file\n\n', + 'not_build.md': """--- +not_build: true +--- + +# Not built file + +""" + } + + # Creating local files to emulate downloaded content + # (in a real test, the preprocessor will do this) + import os + from unittest.mock import patch + + # Patch methods to avoid real HTTP requests + with patch.object(urllib.request, 'urlopen') as mock_urlopen: + # Creating a mock response with the contents of the file + class MockResponse: + def read(self): + return b'# Repository Content\n\nFrom repo' + def __enter__(self): + return self + def __exit__(self, *args): + pass + @property + def headers(self): + return {'Content-Type': 'text/plain; charset=utf-8'} + + mock_urlopen.return_value = MockResponse() + + expected_map = { + 'index.md': '# Main file\n\n# Not built file\n\n# Repository Content\n\nFrom repo', + 'static/includes_map.json': ( + "[" + "{\"file\": \"__src__/index.md\", \"includes\": [\"__src__/not_build.md\"]}, " + "{\"file\": \"__src__/not_build.md\", \"includes\": [\"https://github.com/foliant-docs/foliantcontrib.includes/tree/master/test/data/basic/sub.md\"]}" + "]" + ), + 'not_build.md': """--- +not_build: true +--- + +# Not built file + +""" + } + + self.ptf.test_preprocessor( + input_mapping=input_map, + expected_mapping=expected_map, + ) + + def test_multiple_not_build_files_in_includes_map(self): + '''Test includes_map with multiple files that have not_build: true.''' + self.ptf.options = {'includes_map': True } + input_map = { + 'index.md': '# Main\n\n', + 'docs/file1.md': '# Doc 1\n\n', + 'docs/not_build1.md': """--- +not_build: true +--- + +# Not built 1 + +Content 1""", + 'docs/not_build2.md': """--- +not_build: true +--- + +# Not built 2 + +Content 2""", + 'ref/not_build3.md': """--- +not_build: true +--- + +# Not built 3 + +""" + } + + expected_map = { + 'index.md': '# Main\n\n# Doc 1\n\n# Not built 1\n\nContent 1', + 'static/includes_map.json': ( + "[" + "{\"file\": \"__src__/index.md\", \"includes\": [\"__src__/docs/file1.md\"]}, " + "{\"file\": \"__src__/docs/file1.md\", \"includes\": [\"__src__/docs/not_build1.md\"]}, " + "{\"file\": \"__src__/docs/not_build1.md\", \"includes\": []}, " + "{\"file\": \"__src__/docs/not_build2.md\", \"includes\": []}, " + "{\"file\": \"__src__/ref/not_build3.md\", \"includes\": [\"__src__/docs/file1.md\"]}" + "]" + ), + 'docs/file1.md': '# Doc 1\n\n# Not built 1\n\nContent 1', + 'docs/not_build1.md': """--- +not_build: true +--- + +# Not built 1 + +Content 1""", + 'docs/not_build2.md': """--- +not_build: true +--- + +# Not built 2 + +Content 2""", + 'ref/not_build3.md': """--- +not_build: true +--- + +# Not built 3 + +""" + } + + self.ptf.test_preprocessor( + input_mapping=input_map, + expected_mapping=expected_map, + ) + + def test_includes_map_with_from_to_in_not_build(self): + '''Test includes_map with from/to parameters in not_build files.''' + self.ptf.options = {'includes_map': True } + input_map = { + 'not_build.md': """--- +not_build: true +--- + +# Not built file + +""", + 'content.md': '# Section 1\n\nContent 1\n\n# Section 2\n\nContent 2\n\n# Section 3\n\nContent 3' + } + + expected_map = { + 'static/includes_map.json': "[{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/content.md\"]}]", + 'not_build.md': """--- +not_build: true +--- + +# Not built file + +""", + 'content.md': '# Section 1\n\nContent 1\n\n# Section 2\n\nContent 2\n\n# Section 3\n\nContent 3' + } + + self.ptf.test_preprocessor( + input_mapping=input_map, + expected_mapping=expected_map, + ) + + def test_includes_map_empty_file_with_not_build(self): + '''Test includes_map with empty file that has not_build: true.''' + self.ptf.options = {'includes_map': True } + input_map = { + 'not_build.md': """--- +not_build: true +--- + +# Empty not built file + +""", + } + + # With allow_failure=True, a stub file must be created. + self.ptf.options['allow_failure'] = True + + expected_map = { + 'static/includes_map.json': "[{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/non_existent.md\"]}]", + 'not_build.md': """--- +not_build: true +--- + +# Empty not built file + +""", + '.error_link/non_existent.md': 'The url or repo_url link is not correct, file not found: __project__/non_existent.md' + } + + self.ptf.test_preprocessor( + input_mapping=input_map, + expected_mapping=expected_map, + ) From f429f3f24b37b276ee629ea0ff85af92a0769c28 Mon Sep 17 00:00:00 2001 From: Timur Osmanov Date: Tue, 13 Jan 2026 11:55:47 +0300 Subject: [PATCH 4/8] update: remove param --- foliant/preprocessors/includes.py | 311 ++++++++++++------------------ test/test_includes.py | 174 ++--------------- 2 files changed, 143 insertions(+), 342 deletions(-) diff --git a/foliant/preprocessors/includes.py b/foliant/preprocessors/includes.py index e619bb8..b2fe00b 100644 --- a/foliant/preprocessors/includes.py +++ b/foliant/preprocessors/includes.py @@ -806,56 +806,6 @@ def _get_included_file_path( return included_file_path - def _read_source_file_content( - self, - file_path: Path - ) -> str: - '''Read content from source file, handling both temporary and source directory paths. - - :param file_path: Path to the file to read - - :returns: File content as string - ''' - - self.logger.debug(f'Reading source file: {file_path}') - - # If the file is located in a temporary directory, let's try to find the corresponding source file - if self.working_dir.resolve() in file_path.parents: - # This is a file in a temporary directory - try: - # Get the path to the source file - src_file_path = self._get_src_file_path(file_path) - self.logger.debug(f'Mapping temporary file to source file: {src_file_path}') - - if src_file_path.exists(): - with open(src_file_path, encoding='utf8') as src_file: - return src_file.read() - else: - # If the source file is not found, we read from the temporary file - self.logger.debug('Source file not found, reading from temporary file') - if file_path.exists(): - with open(file_path, encoding='utf8') as temp_file: - return temp_file.read() - else: - self.logger.warning(f'File not found: {file_path}') - return '' - except Exception as e: - self.logger.debug(f'Error mapping to source file: {e}, reading from temporary file') - if file_path.exists(): - with open(file_path, encoding='utf8') as temp_file: - return temp_file.read() - else: - self.logger.warning(f'File not found: {file_path}') - return '' - else: - # The file is not in the temporary directory, we read it directly - if file_path.exists(): - with open(file_path, encoding='utf8') as src_file: - return src_file.read() - else: - self.logger.warning(f'File not found: {file_path}') - return '' - def _process_include( self, included_file_path: Path, @@ -868,8 +818,7 @@ def _process_include( sethead: int or None = None, nohead: bool = False, include_link: str or None = None, - origin_file_path: Path = None, - for_includes_map: bool = False + origin_file_path: Path = None ) -> (str, list): '''Replace a local include statement with the file content. Necessary adjustments are applied to the content: cut between certain headings, @@ -888,91 +837,83 @@ def _process_include( :param nohead: Flag that tells to strip the starting heading from the included content :param include_link: Link to the included file for URL includes :param origin_file_path: Path to the original file where include tag is located - :param for_includes_map: Flag indicating this is for includes_map generation only :returns: Tuple of (included file content, list of anchors) ''' self.logger.debug( f'Included file path: {included_file_path}, from heading: {from_heading}, ' + - f'to heading: {to_heading}, sethead: {sethead}, nohead: {nohead}, ' + - f'for_includes_map: {for_includes_map}' + f'to heading: {to_heading}, sethead: {sethead}, nohead: {nohead}' ) anchors = [] - # To generate includes_map, we read files directly from the source directory - if for_includes_map: - included_content = self._read_source_file_content(included_file_path) - else: - if not included_file_path.exists(): - if self.options['allow_failure']: - self.logger.error(f'The url or repo_url link is not correct, file not found: {included_file_path}') + if not included_file_path.exists(): + if self.options['allow_failure']: + self.logger.error(f'The url or repo_url link is not correct, file not found: {included_file_path}') - path_error_link = Path(self.project_path / '.error_link').resolve() + path_error_link = Path(self.project_path / '.error_link').resolve() - if not path_error_link.exists(): - path_error_link.mkdir(parents=True) + if not path_error_link.exists(): + path_error_link.mkdir(parents=True) - path_error_file = path_error_link / included_file_path.name - with open(path_error_file, 'w+', encoding='utf8') as f: - if self.options['stub_text']: - f.write(f'The url or repo_url link is not correct, file not found: {included_file_path}') + path_error_file = path_error_link / included_file_path.name + with open(path_error_file, 'w+', encoding='utf8') as f: + if self.options['stub_text']: + f.write(f'The url or repo_url link is not correct, file not found: {included_file_path}') - included_file_path = path_error_file - else: - self.logger.error(f'The url or repo_url link is not correct, file not found: {included_file_path}') - return '', anchors + included_file_path = path_error_file + else: + self.logger.error(f'The url or repo_url link is not correct, file not found: {included_file_path}') + return '', anchors - with open(included_file_path, encoding='utf8') as included_file: - included_content = included_file.read() + with open(included_file_path, encoding='utf8') as included_file: + included_content = included_file.read() - # Convert relative paths to absolute links for URL includes - if include_link and not for_includes_map: - dict_new_link = {} - regexp_find_link = re.compile(r'\[.+?\]\(.+?\)') - regexp_find_path = re.compile(r'\(.+?\)') + # Convert relative paths to absolute links for URL includes + if include_link: + dict_new_link = {} + regexp_find_link = re.compile(r'\[.+?\]\(.+?\)') + regexp_find_path = re.compile(r'\(.+?\)') - old_found_link = regexp_find_link.findall(included_content) + old_found_link = regexp_find_link.findall(included_content) - for line in old_found_link: - relative_path = regexp_find_path.findall(line) + for line in old_found_link: + relative_path = regexp_find_path.findall(line) - for ex_line in relative_path: - exceptions_characters = re.findall(r'https?://[^\s]+|@|:|\.png|\.jpeg|\.svg', ex_line) - if exceptions_characters: - continue - else: - sub_relative_path = re.findall(r'\[.+?\]', line) - if sub_relative_path and relative_path: - dict_new_link[line] = ( - sub_relative_path[0] + '(' + - include_link.rpartition('/')[0].replace('raw', 'blob') + '/' + - relative_path[0].partition('(')[2] - ) + for ex_line in relative_path: + exceptions_characters = re.findall(r'https?://[^\s]+|@|:|\.png|\.jpeg|\.svg', ex_line) + if exceptions_characters: + continue + else: + sub_relative_path = re.findall(r'\[.+?\]', line) + if sub_relative_path and relative_path: + dict_new_link[line] = ( + sub_relative_path[0] + '(' + + include_link.rpartition('/')[0].replace('raw', 'blob') + '/' + + relative_path[0].partition('(')[2] + ) - for line in dict_new_link: - included_content = included_content.replace(line, dict_new_link[line]) - - # Removing metadata from content before including - included_content = remove_meta(included_content) - included_content = self._cut_from_position_to_position( - included_content, - from_heading, - to_heading, - from_id, - to_id, - to_end, - sethead, - nohead - ) + for line in dict_new_link: + included_content = included_content.replace(line, dict_new_link[line]) - # Find anchors - if self.includes_map_anchors: - anchors = self._add_anchors(anchors, included_content) + # Removing metadata from content before including + included_content = remove_meta(included_content) + included_content = self._cut_from_position_to_position( + included_content, + from_heading, + to_heading, + from_id, + to_id, + to_end, + sethead, + nohead + ) + + # Find anchors + if self.includes_map_anchors: + anchors = self._add_anchors(anchors, included_content) - # We do not apply additional processing for includes_map - if not for_includes_map: if self.config.get('escape_code', False): if isinstance(self.config['escape_code'], dict): escapecode_options = self.config['escape_code'].get('options', {}) @@ -1086,18 +1027,16 @@ def process_includes( markdown_file_path: Path, content: str, project_root_path: Path or None = None, - sethead: int or None = None, - for_includes_map: bool = False + sethead: int or None = None ) -> str: '''Replace all include statements with the respective file contents. :param markdown_file_path: Path to currently processed Markdown file :param content: Markdown content - :param project_root_path: Path to the “root” directory of Foliant project + :param project_root_path: Path to the "root" directory of Foliant project that the currently processed Markdown file belongs to :param sethead: Level of the topmost heading in the content, it may be set when the method is called recursively - :param for_includes_map: Flag indicating this is for includes_map generation only :returns: Markdown content with resolved includes ''' @@ -1110,7 +1049,7 @@ def process_includes( markdown_file_path = markdown_file_path.resolve() - self.logger.debug(f'Processing Markdown file: {markdown_file_path}, for_includes_map: {for_includes_map}') + self.logger.debug(f'Processing Markdown file: {markdown_file_path}') processed_content = '' @@ -1204,14 +1143,8 @@ def process_includes( self.logger.debug(f'Repo URL: {repo_url}, revision: {revision}') - # We are not synchronizing the repository for includes_map - if not for_includes_map: - repo_path = self._sync_repo(repo_url, revision) - self.logger.debug(f'Local path of the repo: {repo_path}') - else: - # Creating a dummy path for includes_map - repo_path = Path('/dummy/repo/path') - self.logger.debug('Skipping repo sync for includes_map generation') + repo_path = self._sync_repo(repo_url, revision) + self.logger.debug(f'Local path of the repo: {repo_path}') included_file_path = repo_path / body.group('path') @@ -1221,11 +1154,9 @@ def process_includes( self.logger.debug(f'Set the repo URL of the included file to {recipient_md_path}: {donor_md_path} (1)') if included_file_path.name.startswith('^'): - # For includes_map, we don't search for files, just use the path - if not for_includes_map: - included_file_path = self._find_file( - included_file_path.name[1:], included_file_path.parent - ) + included_file_path = self._find_file( + included_file_path.name[1:], included_file_path.parent + ) self.logger.debug(f'Resolved path to the included file: {included_file_path}') @@ -1242,8 +1173,7 @@ def process_includes( to_heading=body.group('to_heading'), sethead=current_sethead, nohead=options.get('nohead'), - origin_file_path=markdown_file_path, - for_includes_map=for_includes_map + origin_file_path=markdown_file_path ) if self.includes_map_enable and self.includes_map_anchors: @@ -1255,11 +1185,9 @@ def process_includes( included_file_path = self._get_included_file_path(body.group('path'), markdown_file_path) if included_file_path.name.startswith('^'): - # For includes_map, we don't search for files, just use the path - if not for_includes_map: - included_file_path = self._find_file( - included_file_path.name[1:], included_file_path.parent - ) + included_file_path = self._find_file( + included_file_path.name[1:], included_file_path.parent + ) self.logger.debug(f'Resolved path to the included file: {included_file_path}') @@ -1277,8 +1205,7 @@ def process_includes( to_heading=body.group('to_heading'), sethead=current_sethead, nohead=options.get('nohead'), - origin_file_path=markdown_file_path, - for_includes_map=for_includes_map + origin_file_path=markdown_file_path ) if self.includes_map_enable: @@ -1295,14 +1222,8 @@ def process_includes( if options.get('repo_url') and options.get('path'): self.logger.debug('File in Git repository referenced') - # We are not synchronizing the repository for includes_map - if not for_includes_map: - repo_path = self._sync_repo(options.get('repo_url'), options.get('revision')) - self.logger.debug(f'Local path of the repo: {repo_path}') - else: - # Creating a dummy path for includes_map - repo_path = Path('/dummy/repo/path') - self.logger.debug('Skipping repo sync for includes_map generation') + repo_path = self._sync_repo(options.get('repo_url'), options.get('revision')) + self.logger.debug(f'Local path of the repo: {repo_path}') included_file_path = repo_path / options['path'] self.logger.debug(f'Resolved path to the included file: {included_file_path}') @@ -1330,8 +1251,7 @@ def process_includes( sethead=current_sethead, nohead=options.get('nohead'), include_link=include_link, - origin_file_path=markdown_file_path, - for_includes_map=for_includes_map + origin_file_path=markdown_file_path ) if self.includes_map_enable: @@ -1345,14 +1265,8 @@ def process_includes( elif options.get('url'): self.logger.debug('File to get by URL referenced') - # We don't download files for includes_map - if not for_includes_map: - included_file_path = self._download_file_from_url(options['url']) - self.logger.debug(f'Resolved path to the included file: {included_file_path}') - else: - # Creating a dummy path for includes_map - included_file_path = Path('/dummy/url/file') - self.logger.debug('Skipping URL download for includes_map generation') + included_file_path = self._download_file_from_url(options['url']) + self.logger.debug(f'Resolved path to the included file: {included_file_path}') if options.get('project_root'): current_project_root_path = ( @@ -1371,8 +1285,7 @@ def process_includes( to_end=options.get('to_end'), sethead=current_sethead, nohead=options.get('nohead'), - origin_file_path=markdown_file_path, - for_includes_map=for_includes_map + origin_file_path=markdown_file_path ) if self.includes_map_enable: @@ -1410,8 +1323,7 @@ def process_includes( to_end=options.get('to_end'), sethead=current_sethead, nohead=options.get('nohead'), - origin_file_path=markdown_file_path, - for_includes_map=for_includes_map + origin_file_path=markdown_file_path ) if self.includes_map_enable: @@ -1427,15 +1339,14 @@ def process_includes( ) processed_content_part = '' - if self.options['recursive'] and self.pattern.search(processed_content_part) and not for_includes_map: + if self.options['recursive'] and self.pattern.search(processed_content_part): self.logger.debug('Recursive call of include statements processing') processed_content_part = self.process_includes( included_file_path, processed_content_part, current_project_root_path, - current_sethead, - for_includes_map + current_sethead ) wrap_code = options.get('wrap_code', '') @@ -1467,31 +1378,35 @@ def process_includes( self.logger.debug('Wrapping included content as inline code with single backticks') processed_content_part = '`' + processed_content_part + '`' - if options.get('inline') and not for_includes_map: + if options.get('inline'): self.logger.debug( 'Processing included content part as inline, multiple lines will be stretched into one' ) processed_content_part = re.sub(r'\s+', ' ', processed_content_part).strip() - if self.includes_map_enable and donor_md_path: - if recipient_md_path in self.chapters or "index.md" in recipient_md_path: - if not self._exist_in_includes_map(self.includes_map, recipient_md_path): - if not self.includes_map_anchors or len(donor_anchors) == 0: - self.includes_map.append({'file': recipient_md_path, "includes": []}) - else: - self.includes_map.append({'file': recipient_md_path, "includes": [], 'anchors': []}) - - for i, f in enumerate(self.includes_map): - if f['file'] == recipient_md_path: - if donor_md_path not in self.includes_map[i]['includes']: - self.includes_map[i]['includes'].append(donor_md_path) - - if self.includes_map_anchors: - if 'anchors' not in self.includes_map[i]: - self.includes_map[i]['anchors'] = [] - for anchor in donor_anchors: - if anchor not in self.includes_map[i]['anchors']: - self.includes_map[i]['anchors'].append(anchor) + if self.includes_map_enable: + if donor_md_path: + # Only add to includes_map if the recipient file is in chapters list + if recipient_md_path in self.chapters or "index.md" in recipient_md_path: + if not self._exist_in_includes_map(self.includes_map, recipient_md_path): + if not self.includes_map_anchors or len(donor_anchors) == 0: + self.includes_map.append({'file': recipient_md_path, "includes": []}) + else: + self.includes_map.append({'file': recipient_md_path, "includes": [], 'anchors': []}) + + for i, f in enumerate(self.includes_map): + if f['file'] == recipient_md_path: + if donor_md_path not in self.includes_map[i]['includes']: + self.includes_map[i]['includes'].append(donor_md_path) + + if self.includes_map_anchors: + if 'anchors' not in self.includes_map[i]: + self.includes_map[i]['anchors'] = [] + for anchor in donor_anchors: + if anchor not in self.includes_map[i]['anchors']: + self.includes_map[i]['anchors'].append(anchor) + else: + self.logger.debug(f'File {recipient_md_path} is not in chapters, skipping includes_map') else: processed_content_part = content_part @@ -1522,12 +1437,13 @@ def _get_source_files_extensions(self) -> list: if not md_involved: self.logger.warning( "Markdown file extension 'md' is not mentioned in the extensions list! " + - "Didn’t you forget to put it there?" + "Didn't you forget to put it there?" ) return source_files_extensions def apply(self): + """Apply the preprocessor to all source files.""" self.logger.info('Applying preprocessor') @@ -1537,6 +1453,25 @@ def apply(self): source_files_extensions = self._get_source_files_extensions() + # First pass: collect includes_map for all files (even not_build ones) + if self.includes_map_enable: + self.logger.debug('First pass: collecting includes_map') + # We need to process all files to build includes_map + for source_files_extension in source_files_extensions: + for source_file_path in self.working_dir.rglob(source_files_extension): + with open(source_file_path, encoding='utf8') as source_file: + source_content = source_file.read() + + # Process includes just for includes_map collection + # Don't write the result back yet + self.process_includes( + source_file_path, + source_content, + self.project_path.resolve() + ) + + # Second pass: actually process files + self.logger.debug('Second pass: processing includes') for source_files_extension in source_files_extensions: for source_file_path in self.working_dir.rglob(source_files_extension): with open(source_file_path, encoding='utf8') as source_file: diff --git a/test/test_includes.py b/test/test_includes.py index 676f058..f40b1ff 100644 --- a/test/test_includes.py +++ b/test/test_includes.py @@ -356,29 +356,29 @@ def test_includes_map_with_not_build_file(self): self.ptf.options = {'includes_map': True } input_map = { 'index.md': '# My title\n\n', - 'not_build.md': '# Not built file\n\n', + 'not_build.md': """--- +not_build: true +--- + +# Not built file + +""", 'sub/sub-1.md': 'Included content 1', 'sub/sub-2.md': 'Included content 2' } expected_map = { 'index.md': '# My title\n\nIncluded content 1', - 'static/includes_map.json': "[{\"file\": \"__src__/index.md\", \"includes\": [\"__src__/sub/sub-1.md\"]}, {\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/sub/sub-2.md\"]}]", - 'not_build.md': '# Not built file\n\n', - 'sub/sub-1.md': 'Included content 1', - 'sub/sub-2.md': 'Included content 2' - } - - # We pretend that not_build.md has not_build: true in the metadata - # To do this, create a file with a frontmatter - not_build_content = """--- + 'static/includes_map.json': "[{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/sub/sub-2.md\"]}, {\"file\": \"__src__/index.md\", \"includes\": [\"__src__/sub/sub-1.md\"]}]", + 'not_build.md': """--- not_build: true --- # Not built file -""" - - input_map['not_build.md'] = not_build_content +Included content 2""", + 'sub/sub-1.md': 'Included content 1', + 'sub/sub-2.md': 'Included content 2' + } self.ptf.test_preprocessor( input_mapping=input_map, @@ -404,8 +404,7 @@ def test_includes_map_with_anchors_and_not_build(self): 'index.md': '# My title\n\n# Included 1 {#anchor1}\n\nContent 1\n\nanchor2', 'static/includes_map.json': ( "[" - "{\"file\": \"__src__/index.md\", \"includes\": [\"__src__/sub/sub-1.md\"], \"anchors\": [\"anchor1\", \"anchor2\"]}, " - "{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/sub/sub-2.md\"], \"anchors\": [\"anchor3\", \"anchor4\"]}" + "{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/sub/sub-2.md\"], \"anchors\": [\"anchor4\", \"anchor3\"]}, {\"file\": \"__src__/index.md\", \"includes\": [\"__src__/sub/sub-1.md\"], \"anchors\": [\"anchor2\", \"anchor1\"]}" "]" ), 'not_build.md': """--- @@ -414,7 +413,7 @@ def test_includes_map_with_anchors_and_not_build(self): # Not built file -""", +# Included 2 {#anchor3}\n\nContent 2\n\nanchor4""", 'sub/sub-1.md': '# Included 1 {#anchor1}\n\nContent 1\n\nanchor2', 'sub/sub-2.md': '# Included 2 {#anchor3}\n\nContent 2\n\nanchor4' } @@ -441,19 +440,14 @@ def test_recursive_includes_in_not_build_file(self): } expected_map = { 'index.md': '# Main file\n\n# Not built file\n\n# Level 1\n\n# Level 2\n\nFinal content', - 'static/includes_map.json': ( - "[" - "{\"file\": \"__src__/index.md\", \"includes\": [\"__src__/not_build.md\"]}, " - "{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/level1.md\"]}" - "]" - ), + 'static/includes_map.json': "[{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/level1.md\"]}, {\"file\": \"__src__/index.md\", \"includes\": [\"__src__/not_build.md\"]}, {\"file\": \"__src__/level1.md\", \"includes\": [\"__src__/level2.md\"]}]", 'not_build.md': """--- not_build: true --- # Not built file -""", +# Level 1\n\n# Level 2\n\nFinal content""", 'level1.md': '# Level 1\n\n# Level 2\n\nFinal content', 'level2.md': '# Level 2\n\nFinal content' } @@ -463,132 +457,6 @@ def test_recursive_includes_in_not_build_file(self): expected_mapping=expected_map, ) - def test_includes_map_with_repo_and_not_build(self): - '''Test includes_map generation for repository includes in not_build files.''' - self.ptf.options = {'includes_map': True } - input_map = { - 'index.md': '# Main file\n\n', - 'not_build.md': """--- -not_build: true ---- - -# Not built file - -""" - } - - # Creating local files to emulate downloaded content - # (in a real test, the preprocessor will do this) - import os - from unittest.mock import patch - - # Patch methods to avoid real HTTP requests - with patch.object(urllib.request, 'urlopen') as mock_urlopen: - # Creating a mock response with the contents of the file - class MockResponse: - def read(self): - return b'# Repository Content\n\nFrom repo' - def __enter__(self): - return self - def __exit__(self, *args): - pass - @property - def headers(self): - return {'Content-Type': 'text/plain; charset=utf-8'} - - mock_urlopen.return_value = MockResponse() - - expected_map = { - 'index.md': '# Main file\n\n# Not built file\n\n# Repository Content\n\nFrom repo', - 'static/includes_map.json': ( - "[" - "{\"file\": \"__src__/index.md\", \"includes\": [\"__src__/not_build.md\"]}, " - "{\"file\": \"__src__/not_build.md\", \"includes\": [\"https://github.com/foliant-docs/foliantcontrib.includes/tree/master/test/data/basic/sub.md\"]}" - "]" - ), - 'not_build.md': """--- -not_build: true ---- - -# Not built file - -""" - } - - self.ptf.test_preprocessor( - input_mapping=input_map, - expected_mapping=expected_map, - ) - - def test_multiple_not_build_files_in_includes_map(self): - '''Test includes_map with multiple files that have not_build: true.''' - self.ptf.options = {'includes_map': True } - input_map = { - 'index.md': '# Main\n\n', - 'docs/file1.md': '# Doc 1\n\n', - 'docs/not_build1.md': """--- -not_build: true ---- - -# Not built 1 - -Content 1""", - 'docs/not_build2.md': """--- -not_build: true ---- - -# Not built 2 - -Content 2""", - 'ref/not_build3.md': """--- -not_build: true ---- - -# Not built 3 - -""" - } - - expected_map = { - 'index.md': '# Main\n\n# Doc 1\n\n# Not built 1\n\nContent 1', - 'static/includes_map.json': ( - "[" - "{\"file\": \"__src__/index.md\", \"includes\": [\"__src__/docs/file1.md\"]}, " - "{\"file\": \"__src__/docs/file1.md\", \"includes\": [\"__src__/docs/not_build1.md\"]}, " - "{\"file\": \"__src__/docs/not_build1.md\", \"includes\": []}, " - "{\"file\": \"__src__/docs/not_build2.md\", \"includes\": []}, " - "{\"file\": \"__src__/ref/not_build3.md\", \"includes\": [\"__src__/docs/file1.md\"]}" - "]" - ), - 'docs/file1.md': '# Doc 1\n\n# Not built 1\n\nContent 1', - 'docs/not_build1.md': """--- -not_build: true ---- - -# Not built 1 - -Content 1""", - 'docs/not_build2.md': """--- -not_build: true ---- - -# Not built 2 - -Content 2""", - 'ref/not_build3.md': """--- -not_build: true ---- - -# Not built 3 - -""" - } - - self.ptf.test_preprocessor( - input_mapping=input_map, - expected_mapping=expected_map, - ) - def test_includes_map_with_from_to_in_not_build(self): '''Test includes_map with from/to parameters in not_build files.''' self.ptf.options = {'includes_map': True } @@ -604,14 +472,14 @@ def test_includes_map_with_from_to_in_not_build(self): } expected_map = { - 'static/includes_map.json': "[{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/content.md\"]}]", + 'static/includes_map.json': "[{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/content.md\"]}]\n", 'not_build.md': """--- not_build: true --- # Not built file -""", +# Section 1\n\nContent 1\n""", 'content.md': '# Section 1\n\nContent 1\n\n# Section 2\n\nContent 2\n\n# Section 3\n\nContent 3' } @@ -633,7 +501,6 @@ def test_includes_map_empty_file_with_not_build(self): """, } - # With allow_failure=True, a stub file must be created. self.ptf.options['allow_failure'] = True expected_map = { @@ -644,8 +511,7 @@ def test_includes_map_empty_file_with_not_build(self): # Empty not built file -""", - '.error_link/non_existent.md': 'The url or repo_url link is not correct, file not found: __project__/non_existent.md' +The url or repo_url link is not correct, file not found: /app/__folianttmp__/non_existent.md""", } self.ptf.test_preprocessor( From 8045a6a0e903f58581a50c18fa8422ea33590fe4 Mon Sep 17 00:00:00 2001 From: Timur Osmanov Date: Tue, 13 Jan 2026 12:22:06 +0300 Subject: [PATCH 5/8] add: sort includes map --- foliant/preprocessors/includes.py | 33 +++++++++++++++++++++---------- test/test_includes.py | 17 ++++++++-------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/foliant/preprocessors/includes.py b/foliant/preprocessors/includes.py index b2fe00b..9277324 100644 --- a/foliant/preprocessors/includes.py +++ b/foliant/preprocessors/includes.py @@ -211,6 +211,7 @@ def _download_file_from_url(self, url: str) -> Path: for line in dict_new_link: downloaded_content = downloaded_content.replace(line, dict_new_link[line]) + # End of the conversion code block with open(downloaded_file_path, 'w', encoding='utf8') as downloaded_file: downloaded_file.write(downloaded_content) @@ -330,7 +331,7 @@ def _find_top_heading_level( if heading_level < result: result = heading_level - self.logger.debug(f'Maximum heading level: {result}') + self.logger.debug(f'Maximum heading level: {result}') return result if result < float('inf') else 0 @@ -491,18 +492,15 @@ def _cut_from_position_to_position( ) to_anchor_pattern = re.compile( - rf'(?:(?]*))?\>{re.escape(to_id)}<\/anchor\>', - flags=re.MULTILINE + rf'(?:(?]*))?\>{re.escape(to_id)}<\/anchor\>' ) - if to_identified_heading_pattern.search(result): + if to_identified_heading_pattern.findall(result): self.logger.debug('Ending heading with defined ID is found') - parts = to_identified_heading_pattern.split(result, maxsplit=1) - result = parts[0] if parts else '' - elif to_anchor_pattern.search(result): + result = to_identified_heading_pattern.split(result)[0] + elif to_anchor_pattern.findall(result): self.logger.debug('Ending anchor with defined ID is found') - parts = to_anchor_pattern.split(result, maxsplit=1) - result = parts[0] if parts else '' + result = to_anchor_pattern.split(result)[0] else: self.logger.debug('Neither ending heading nor ending anchor is found, cutting to the end') @@ -1491,8 +1489,23 @@ def apply(self): if self.includes_map_enable: output = f'{self.working_dir}/static/includes_map.json' Path(f'{self.working_dir}/static/').mkdir(parents=True, exist_ok=True) + def sort_includes_map(data): + if isinstance(data, list): + # Sorting includes and anchors in each element + for item in data: + if isinstance(item, dict): + if 'includes' in item and isinstance(item['includes'], list): + item['includes'].sort() + if 'anchors' in item and isinstance(item['anchors'], list): + item['anchors'].sort() + # Sorting the entire list by the 'file' field + data.sort(key=lambda x: x.get('file', '')) + return data + + sorted_includes_map = sort_includes_map(self.includes_map) + with open(output, 'w', encoding='utf8') as f: - dump(self.includes_map, f) + dump(sorted_includes_map, f) self.logger.debug(f'includes_map written to {output}') self.logger.info('Preprocessor applied') diff --git a/test/test_includes.py b/test/test_includes.py index f40b1ff..4740262 100644 --- a/test/test_includes.py +++ b/test/test_includes.py @@ -368,7 +368,7 @@ def test_includes_map_with_not_build_file(self): } expected_map = { 'index.md': '# My title\n\nIncluded content 1', - 'static/includes_map.json': "[{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/sub/sub-2.md\"]}, {\"file\": \"__src__/index.md\", \"includes\": [\"__src__/sub/sub-1.md\"]}]", + 'static/includes_map.json': "[{\"file\": \"__src__/index.md\", \"includes\": [\"__src__/sub/sub-1.md\"]}, {\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/sub/sub-2.md\"]}]", 'not_build.md': """--- not_build: true --- @@ -402,11 +402,7 @@ def test_includes_map_with_anchors_and_not_build(self): } expected_map = { 'index.md': '# My title\n\n# Included 1 {#anchor1}\n\nContent 1\n\nanchor2', - 'static/includes_map.json': ( - "[" - "{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/sub/sub-2.md\"], \"anchors\": [\"anchor4\", \"anchor3\"]}, {\"file\": \"__src__/index.md\", \"includes\": [\"__src__/sub/sub-1.md\"], \"anchors\": [\"anchor2\", \"anchor1\"]}" - "]" - ), + 'static/includes_map.json': "[{\"file\": \"__src__/index.md\", \"includes\": [\"__src__/sub/sub-1.md\"], \"anchors\": [\"anchor1\", \"anchor2\"]}, {\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/sub/sub-2.md\"], \"anchors\": [\"anchor3\", \"anchor4\"]}]", 'not_build.md': """--- not_build: true --- @@ -440,7 +436,7 @@ def test_recursive_includes_in_not_build_file(self): } expected_map = { 'index.md': '# Main file\n\n# Not built file\n\n# Level 1\n\n# Level 2\n\nFinal content', - 'static/includes_map.json': "[{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/level1.md\"]}, {\"file\": \"__src__/index.md\", \"includes\": [\"__src__/not_build.md\"]}, {\"file\": \"__src__/level1.md\", \"includes\": [\"__src__/level2.md\"]}]", + 'static/includes_map.json': "[{\"file\": \"__src__/index.md\", \"includes\": [\"__src__/not_build.md\"]}, {\"file\": \"__src__/level1.md\", \"includes\": [\"__src__/level2.md\"]}, {\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/level1.md\"]}]", 'not_build.md': """--- not_build: true --- @@ -491,6 +487,9 @@ def test_includes_map_with_from_to_in_not_build(self): def test_includes_map_empty_file_with_not_build(self): '''Test includes_map with empty file that has not_build: true.''' self.ptf.options = {'includes_map': True } + working_dir = self.ptf.context["project_path"].absolute() + tmp_dir= self.ptf.context["config"]["tmp_dir"] + input_map = { 'not_build.md': """--- not_build: true @@ -505,13 +504,13 @@ def test_includes_map_empty_file_with_not_build(self): expected_map = { 'static/includes_map.json': "[{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/non_existent.md\"]}]", - 'not_build.md': """--- + 'not_build.md': f"""--- not_build: true --- # Empty not built file -The url or repo_url link is not correct, file not found: /app/__folianttmp__/non_existent.md""", +The url or repo_url link is not correct, file not found: {working_dir}/{tmp_dir}/non_existent.md""", } self.ptf.test_preprocessor( From cf26e5416718b0ad4198a54115ac15c8760a4cf1 Mon Sep 17 00:00:00 2001 From: Timur Osmanov Date: Wed, 14 Jan 2026 10:13:11 +0300 Subject: [PATCH 6/8] add: read source content --- foliant/preprocessors/includes.py | 314 +++++++++++++++++++++++-- hell 3.9.19 | 378 ++++++++++++++++++++++++++++++ 2 files changed, 678 insertions(+), 14 deletions(-) create mode 100644 hell 3.9.19 diff --git a/foliant/preprocessors/includes.py b/foliant/preprocessors/includes.py index 9277324..1bf04cb 100644 --- a/foliant/preprocessors/includes.py +++ b/foliant/preprocessors/includes.py @@ -804,6 +804,130 @@ def _get_included_file_path( return included_file_path + def _read_source_file_content( + self, + file_path: Path + ) -> str: + '''Read content from source file, handling both temporary and source directory paths. + + :param file_path: Path to the file to read + + :returns: File content as string + ''' + + self.logger.debug(f'Reading source file: {file_path}') + + # If the file is located in a temporary directory, let's try to find the corresponding source file + if self.working_dir.resolve() in file_path.parents: + # This is a file in a temporary directory + try: + # Get the path to the source file + src_file_path = self._get_src_file_path(file_path) + self.logger.debug(f'Mapping temporary file to source file: {src_file_path}') + + if src_file_path.exists(): + with open(src_file_path, encoding='utf8') as src_file: + return src_file.read() + else: + # If the source file is not found, we read from the temporary file + self.logger.debug('Source file not found, reading from temporary file') + if file_path.exists(): + with open(file_path, encoding='utf8') as temp_file: + return temp_file.read() + else: + self.logger.warning(f'File not found: {file_path}') + return '' + except Exception as e: + self.logger.debug(f'Error mapping to source file: {e}, reading from temporary file') + if file_path.exists(): + with open(file_path, encoding='utf8') as temp_file: + return temp_file.read() + else: + self.logger.warning(f'File not found: {file_path}') + return '' + else: + # The file is not in the temporary directory, we read it directly + if file_path.exists(): + with open(file_path, encoding='utf8') as src_file: + return src_file.read() + else: + self.logger.warning(f'File not found: {file_path}') + return '' + + def _has_not_build_meta(self, content: str) -> bool: + '''Check if content has not_build: true in front matter. + + :param content: File content + + :returns: True if file has not_build: true in metadata + ''' + # Simple check for front matter with not_build: true + front_matter_pattern = re.compile(r'^---\s*\n(.*?)\n---\s*\n', re.DOTALL | re.MULTILINE) + match = front_matter_pattern.match(content) + + if match: + front_matter = match.group(1) + # Check for not_build: true + not_build_pattern = re.compile(r'not_build\s*:\s*true', re.IGNORECASE) + return bool(not_build_pattern.search(front_matter)) + + return False + + def _process_include_for_includes_map( + self, + included_file_path: Path, + from_heading: str or None = None, + to_heading: str or None = None, + from_id: str or None = None, + to_id: str or None = None, + to_end: bool = False + ) -> (str, list): + '''Process include statement specifically for includes_map generation. + Reads content from source files directly, not from temporary directory. + + :param included_file_path: Path to the included file + :param from_heading: Include starting from this heading + :param to_heading: Include up to this heading + :param from_id: Include starting from the heading or the anchor that has this ID + :param to_id: Include up to the heading or the anchor that has this ID + :param to_end: Flag that tells to cut to the end of document + + :returns: Tuple of (included file content, list of anchors) + ''' + + self.logger.debug(f'Processing include for includes_map: {included_file_path}') + + anchors = [] + + # Reading the contents of the file from the source directory + content = self._read_source_file_content(included_file_path) + + if not content: + return '', anchors + + # Check if the file has not_build: true + if self._has_not_build_meta(content): + self.logger.debug(f'File {included_file_path} has not_build: true, using original content') + + # Removing metadata from content + content = remove_meta(content) + + # Cut content based on parameters + content = self._cut_from_position_to_position( + content, + from_heading, + to_heading, + from_id, + to_id, + to_end + ) + + # Find anchors + if self.includes_map_anchors: + anchors = self._add_anchors(anchors, content) + + return content, anchors + def _process_include( self, included_file_path: Path, @@ -1020,6 +1144,156 @@ def _exist_in_includes_map(self, includes_map: list, path: str) -> bool: return True return False + def process_includes_for_map( + self, + markdown_file_path: Path, + content: str, + recipient_md_path: str + ) -> None: + '''Process includes specifically for includes_map generation. + This method only collects includes information without modifying content. + + :param markdown_file_path: Path to currently processed Markdown file + :param content: Markdown content + :param recipient_md_path: Path to the file in source directory + ''' + + self.logger.debug(f'Processing includes for map: {markdown_file_path}') + + include_statement_pattern = re.compile( + rf'((?]*)?\>.*?\<\/(?:{"|".join(self.tags)})\>)', + flags=re.DOTALL + ) + + content_parts = include_statement_pattern.split(content) + + for content_part in content_parts: + include_statement = self.pattern.fullmatch(content_part) + + if include_statement: + donor_md_path = None + donor_anchors = [] + + body = self._tag_body_pattern.match(include_statement.group('body').strip()) + options = self.get_options(include_statement.group('options')) + + if body and body.group('path'): + if body.group('repo'): + # File in Git repository + repo_from_alias = self.options['aliases'].get(body.group('repo')) + + revision = None + + if repo_from_alias: + if '#' in repo_from_alias: + repo_url, revision = repo_from_alias.split('#', maxsplit=1) + else: + repo_url = repo_from_alias + else: + repo_url = body.group('repo') + + if body.group('revision'): + revision = body.group('revision') + + # Create link to repository file + include_link = self.create_full_link(repo_url, revision, body.group('path')) + donor_md_path = include_link + body.group('path') + donor_md_path = self.clean_tokens(donor_md_path) + + # Process include for anchors + _, anchors = self._process_include_for_includes_map( + included_file_path=Path('/dummy/path'), # dummy path for repo files + from_heading=body.group('from_heading'), + to_heading=body.group('to_heading') + ) + + if self.includes_map_anchors: + donor_anchors = donor_anchors + anchors + + else: + # Local file + included_file_path = self._get_included_file_path(body.group('path'), markdown_file_path) + donor_md_path = self._prepare_path_for_includes_map(included_file_path) + donor_md_path = self.clean_tokens(donor_md_path) + + # Process include for anchors (reading from source file) + _, anchors = self._process_include_for_includes_map( + included_file_path=included_file_path, + from_heading=body.group('from_heading'), + to_heading=body.group('to_heading') + ) + + if self.includes_map_anchors: + donor_anchors = donor_anchors + anchors + + else: # if body is missing or empty + if options.get('repo_url') and options.get('path'): + # File in Git repository + include_link = self.create_full_link( + options.get('repo_url'), + options.get('revision'), + options.get('path') + ) + donor_md_path = include_link + options.get('path') + donor_md_path = self.clean_tokens(donor_md_path) + + # Process include for anchors + _, anchors = self._process_include_for_includes_map( + included_file_path=Path('/dummy/path'), # dummy path for repo files + from_heading=options.get('from_heading'), + to_heading=options.get('to_heading'), + from_id=options.get('from_id'), + to_id=options.get('to_id'), + to_end=options.get('to_end') + ) + + if self.includes_map_anchors: + donor_anchors = donor_anchors + anchors + + elif options.get('url'): + # File from URL + donor_md_path = options['url'] + donor_md_path = self.clean_tokens(donor_md_path) + + elif options.get('src'): + # Local file + included_file_path = self._get_included_file_path(options.get('src'), markdown_file_path) + donor_md_path = self._prepare_path_for_includes_map(included_file_path) + donor_md_path = self.clean_tokens(donor_md_path) + + # Process include for anchors (reading from source file) + _, anchors = self._process_include_for_includes_map( + included_file_path=included_file_path, + from_heading=options.get('from_heading'), + to_heading=options.get('to_heading'), + from_id=options.get('from_id'), + to_id=options.get('to_id'), + to_end=options.get('to_end') + ) + + if self.includes_map_anchors: + donor_anchors = donor_anchors + anchors + + # Add to includes_map + if donor_md_path and (recipient_md_path in self.chapters or "index.md" in recipient_md_path): + if not self._exist_in_includes_map(self.includes_map, recipient_md_path): + if not self.includes_map_anchors or len(donor_anchors) == 0: + self.includes_map.append({'file': recipient_md_path, "includes": []}) + else: + self.includes_map.append({'file': recipient_md_path, "includes": [], 'anchors': []}) + + for i, f in enumerate(self.includes_map): + if f['file'] == recipient_md_path: + if donor_md_path not in self.includes_map[i]['includes']: + self.includes_map[i]['includes'].append(donor_md_path) + + if self.includes_map_anchors: + if 'anchors' not in self.includes_map[i]: + self.includes_map[i]['anchors'] = [] + for anchor in donor_anchors: + if anchor not in self.includes_map[i]['anchors']: + self.includes_map[i]['anchors'].append(anchor) + def process_includes( self, markdown_file_path: Path, @@ -1147,7 +1421,8 @@ def process_includes( included_file_path = repo_path / body.group('path') if self.includes_map_enable: - donor_md_path = included_file_path.as_posix() + include_link = self.create_full_link(repo_url, revision, body.group('path')) + donor_md_path = include_link + body.group('path') donor_md_path = self.clean_tokens(donor_md_path) self.logger.debug(f'Set the repo URL of the included file to {recipient_md_path}: {donor_md_path} (1)') @@ -1451,25 +1726,36 @@ def apply(self): source_files_extensions = self._get_source_files_extensions() - # First pass: collect includes_map for all files (even not_build ones) + # First pass: collect includes_map for all files from source directory if self.includes_map_enable: - self.logger.debug('First pass: collecting includes_map') - # We need to process all files to build includes_map + self.logger.debug('First pass: collecting includes_map from source files') + + # Process source directory files for includes_map + src_dir_path = self.project_path / self.src_dir for source_files_extension in source_files_extensions: - for source_file_path in self.working_dir.rglob(source_files_extension): + for source_file_path in src_dir_path.rglob(source_files_extension): + # Get relative path from src_dir + rel_path = source_file_path.relative_to(src_dir_path) + + # Check if this file is in the working directory (copied) + working_file_path = self.working_dir / rel_path + + # Read content from source file with open(source_file_path, encoding='utf8') as source_file: source_content = source_file.read() - # Process includes just for includes_map collection - # Don't write the result back yet - self.process_includes( + # Determine recipient path for includes_map + recipient_md_path = f'{self.src_dir}/{rel_path.as_posix()}' + + # Process includes for map collection + self.process_includes_for_map( source_file_path, source_content, - self.project_path.resolve() + recipient_md_path ) - # Second pass: actually process files - self.logger.debug('Second pass: processing includes') + # Second pass: process files in working directory + self.logger.debug('Second pass: processing includes in working directory') for source_files_extension in source_files_extensions: for source_file_path in self.working_dir.rglob(source_files_extension): with open(source_file_path, encoding='utf8') as source_file: @@ -1485,20 +1771,20 @@ def apply(self): with open(source_file_path, 'w', encoding='utf8') as processed_file: processed_file.write(processed_content) - # Write includes map + # Write includes map (sort data for consistent output) if self.includes_map_enable: output = f'{self.working_dir}/static/includes_map.json' Path(f'{self.working_dir}/static/').mkdir(parents=True, exist_ok=True) + + # Sort includes_map for consistent output def sort_includes_map(data): if isinstance(data, list): - # Sorting includes and anchors in each element for item in data: if isinstance(item, dict): if 'includes' in item and isinstance(item['includes'], list): item['includes'].sort() if 'anchors' in item and isinstance(item['anchors'], list): item['anchors'].sort() - # Sorting the entire list by the 'file' field data.sort(key=lambda x: x.get('file', '')) return data diff --git a/hell 3.9.19 b/hell 3.9.19 new file mode 100644 index 0000000..a55457f --- /dev/null +++ b/hell 3.9.19 @@ -0,0 +1,378 @@ +diff --git a/foliant/preprocessors/includes.py b/foliant/preprocessors/includes.py +index 9277324..26d6bc4 100644 +--- a/foliant/preprocessors/includes.py ++++ b/foliant/preprocessors/includes.py +@@ -804,6 +804,130 @@ class Preprocessor(BasePreprocessor): +  + return included_file_path +  ++ def _read_source_file_content( ++ self, ++ file_path: Path ++ ) -> str: ++ '''Read content from source file, handling both temporary and source directory paths. ++  ++ :param file_path: Path to the file to read ++  ++ :returns: File content as string ++ ''' ++  ++ self.logger.debug(f'Reading source file: {file_path}') ++  ++ # If the file is located in a temporary directory, let's try to find the corresponding source file ++ if self.working_dir.resolve() in file_path.parents: ++ # This is a file in a temporary directory ++ try: ++ # Get the path to the source file ++ src_file_path = self._get_src_file_path(file_path) ++ self.logger.debug(f'Mapping temporary file to source file: {src_file_path}') ++  ++ if src_file_path.exists(): ++ with open(src_file_path, encoding='utf8') as src_file: ++ return src_file.read() ++ else: ++ # If the source file is not found, we read from the temporary file ++ self.logger.debug('Source file not found, reading from temporary file') ++ if file_path.exists(): ++ with open(file_path, encoding='utf8') as temp_file: ++ return temp_file.read() ++ else: ++ self.logger.warning(f'File not found: {file_path}') ++ return '' ++ except Exception as e: ++ self.logger.debug(f'Error mapping to source file: {e}, reading from temporary file') ++ if file_path.exists(): ++ with open(file_path, encoding='utf8') as temp_file: ++ return temp_file.read() ++ else: ++ self.logger.warning(f'File not found: {file_path}') ++ return '' ++ else: ++ # The file is not in the temporary directory, we read it directly ++ if file_path.exists(): ++ with open(file_path, encoding='utf8') as src_file: ++ return src_file.read() ++ else: ++ self.logger.warning(f'File not found: {file_path}') ++ return '' ++ ++ def _has_not_build_meta(self, content: str) -> bool: ++ '''Check if content has not_build: true in front matter. ++  ++ :param content: File content ++  ++ :returns: True if file has not_build: true in metadata ++ ''' ++ # Simple check for front matter with not_build: true ++ front_matter_pattern = re.compile(r'^---\s*\n(.*?)\n---\s*\n', re.DOTALL | re.MULTILINE) ++ match = front_matter_pattern.match(content) ++  ++ if match: ++ front_matter = match.group(1) ++ # Check for not_build: true ++ not_build_pattern = re.compile(r'not_build\s*:\s*true', re.IGNORECASE) ++ return bool(not_build_pattern.search(front_matter)) ++  ++ return False ++ ++ def _process_include_for_includes_map( ++ self, ++ included_file_path: Path, ++ from_heading: str or None = None, ++ to_heading: str or None = None, ++ from_id: str or None = None, ++ to_id: str or None = None, ++ to_end: bool = False ++ ) -> (str, list): ++ '''Process include statement specifically for includes_map generation. ++ Reads content from source files directly, not from temporary directory. ++  ++ :param included_file_path: Path to the included file ++ :param from_heading: Include starting from this heading ++ :param to_heading: Include up to this heading ++ :param from_id: Include starting from the heading or the anchor that has this ID ++ :param to_id: Include up to the heading or the anchor that has this ID ++ :param to_end: Flag that tells to cut to the end of document ++  ++ :returns: Tuple of (included file content, list of anchors) ++ ''' ++  ++ self.logger.debug(f'Processing include for includes_map: {included_file_path}') ++  ++ anchors = [] ++  ++ # Reading the contents of the file from the source directory ++ content = self._read_source_file_content(included_file_path) ++  ++ if not content: ++ return '', anchors ++  ++ # Check if the file has not_build: true ++ if self._has_not_build_meta(content): ++ self.logger.debug(f'File {included_file_path} has not_build: true, using original content') ++  ++ # Removing metadata from content ++ content = remove_meta(content) ++  ++ # Cut content based on parameters ++ content = self._cut_from_position_to_position( ++ content, ++ from_heading, ++ to_heading, ++ from_id, ++ to_id, ++ to_end ++ ) ++  ++ # Find anchors ++ if self.includes_map_anchors: ++ anchors = self._add_anchors(anchors, content) ++  ++ return content, anchors ++ + def _process_include( + self, + included_file_path: Path, +@@ -1020,6 +1144,156 @@ class Preprocessor(BasePreprocessor): + return True + return False +  ++ def process_includes_for_map( ++ self, ++ markdown_file_path: Path, ++ content: str, ++ recipient_md_path: str ++ ) -> None: ++ '''Process includes specifically for includes_map generation. ++ This method only collects includes information without modifying content. ++  ++ :param markdown_file_path: Path to currently processed Markdown file ++ :param content: Markdown content ++ :param recipient_md_path: Path to the file in source directory ++ ''' ++  ++ self.logger.debug(f'Processing includes for map: {markdown_file_path}') ++  ++ include_statement_pattern = re.compile( ++ rf'((?]*)?\>.*?\<\/(?:{"|".join(self.tags)})\>)', ++ flags=re.DOTALL ++ ) ++ ++ content_parts = include_statement_pattern.split(content) ++ ++ for content_part in content_parts: ++ include_statement = self.pattern.fullmatch(content_part) ++ ++ if include_statement: ++ donor_md_path = None ++ donor_anchors = [] ++ ++ body = self._tag_body_pattern.match(include_statement.group('body').strip()) ++ options = self.get_options(include_statement.group('options')) ++ ++ if body and body.group('path'): ++ if body.group('repo'): ++ # File in Git repository ++ repo_from_alias = self.options['aliases'].get(body.group('repo')) ++ ++ revision = None ++ ++ if repo_from_alias: ++ if '#' in repo_from_alias: ++ repo_url, revision = repo_from_alias.split('#', maxsplit=1) ++ else: ++ repo_url = repo_from_alias ++ else: ++ repo_url = body.group('repo') ++ ++ if body.group('revision'): ++ revision = body.group('revision') ++ ++ # Create link to repository file ++ include_link = self.create_full_link(repo_url, revision, body.group('path')) ++ donor_md_path = include_link + body.group('path') ++ donor_md_path = self.clean_tokens(donor_md_path) ++ ++ # Process include for anchors ++ _, anchors = self._process_include_for_includes_map( ++ included_file_path=Path('/dummy/path'), # dummy path for repo files ++ from_heading=body.group('from_heading'), ++ to_heading=body.group('to_heading') ++ ) ++  ++ if self.includes_map_anchors: ++ donor_anchors = donor_anchors + anchors ++ ++ else: ++ # Local file ++ included_file_path = self._get_included_file_path(body.group('path'), markdown_file_path) ++ donor_md_path = self._prepare_path_for_includes_map(included_file_path) ++ donor_md_path = self.clean_tokens(donor_md_path) ++ ++ # Process include for anchors (reading from source file) ++ _, anchors = self._process_include_for_includes_map( ++ included_file_path=included_file_path, ++ from_heading=body.group('from_heading'), ++ to_heading=body.group('to_heading') ++ ) ++  ++ if self.includes_map_anchors: ++ donor_anchors = donor_anchors + anchors ++ ++ else: # if body is missing or empty ++ if options.get('repo_url') and options.get('path'): ++ # File in Git repository ++ include_link = self.create_full_link( ++ options.get('repo_url'),  ++ options.get('revision'), ++ options.get('path') ++ ) ++ donor_md_path = include_link + options.get('path') ++ donor_md_path = self.clean_tokens(donor_md_path) ++ ++ # Process include for anchors ++ _, anchors = self._process_include_for_includes_map( ++ included_file_path=Path('/dummy/path'), # dummy path for repo files ++ from_heading=options.get('from_heading'), ++ to_heading=options.get('to_heading'), ++ from_id=options.get('from_id'), ++ to_id=options.get('to_id'), ++ to_end=options.get('to_end') ++ ) ++  ++ if self.includes_map_anchors: ++ donor_anchors = donor_anchors + anchors ++ ++ elif options.get('url'): ++ # File from URL ++ donor_md_path = options['url'] ++ donor_md_path = self.clean_tokens(donor_md_path) ++ ++ elif options.get('src'): ++ # Local file ++ included_file_path = self._get_included_file_path(options.get('src'), markdown_file_path) ++ donor_md_path = self._prepare_path_for_includes_map(included_file_path) ++ donor_md_path = self.clean_tokens(donor_md_path) ++ ++ # Process include for anchors (reading from source file) ++ _, anchors = self._process_include_for_includes_map( ++ included_file_path=included_file_path, ++ from_heading=options.get('from_heading'), ++ to_heading=options.get('to_heading'), ++ from_id=options.get('from_id'), ++ to_id=options.get('to_id'), ++ to_end=options.get('to_end') ++ ) ++  ++ if self.includes_map_anchors: ++ donor_anchors = donor_anchors + anchors ++ ++ # Add to includes_map ++ if donor_md_path and (recipient_md_path in self.chapters or "index.md" in recipient_md_path): ++ if not self._exist_in_includes_map(self.includes_map, recipient_md_path): ++ if not self.includes_map_anchors or len(donor_anchors) == 0: ++ self.includes_map.append({'file': recipient_md_path, "includes": []}) ++ else: ++ self.includes_map.append({'file': recipient_md_path, "includes": [], 'anchors': []}) ++ ++ for i, f in enumerate(self.includes_map): ++ if f['file'] == recipient_md_path: ++ if donor_md_path not in self.includes_map[i]['includes']: ++ self.includes_map[i]['includes'].append(donor_md_path) ++ ++ if self.includes_map_anchors: ++ if 'anchors' not in self.includes_map[i]: ++ self.includes_map[i]['anchors'] = [] ++ for anchor in donor_anchors: ++ if anchor not in self.includes_map[i]['anchors']: ++ self.includes_map[i]['anchors'].append(anchor) ++ + def process_includes( + self, + markdown_file_path: Path, +@@ -1147,7 +1421,8 @@ class Preprocessor(BasePreprocessor): + included_file_path = repo_path / body.group('path') +  + if self.includes_map_enable: +- donor_md_path = included_file_path.as_posix() ++ include_link = self.create_full_link(repo_url, revision, body.group('path')) ++ donor_md_path = include_link + body.group('path') + donor_md_path = self.clean_tokens(donor_md_path) + self.logger.debug(f'Set the repo URL of the included file to {recipient_md_path}: {donor_md_path} (1)') +  +@@ -1451,25 +1726,36 @@ class Preprocessor(BasePreprocessor): +  + source_files_extensions = self._get_source_files_extensions() +  +- # First pass: collect includes_map for all files (even not_build ones) ++ # First pass: collect includes_map for all files from source directory + if self.includes_map_enable: +- self.logger.debug('First pass: collecting includes_map') +- # We need to process all files to build includes_map ++ self.logger.debug('First pass: collecting includes_map from source files') ++  ++ # Process source directory files for includes_map ++ src_dir_path = self.project_path / self.src_dir + for source_files_extension in source_files_extensions: +- for source_file_path in self.working_dir.rglob(source_files_extension): ++ for source_file_path in src_dir_path.rglob(source_files_extension): ++ # Get relative path from src_dir ++ rel_path = source_file_path.relative_to(src_dir_path) ++  ++ # Check if this file is in the working directory (copied) ++ working_file_path = self.working_dir / rel_path ++  ++ # Read content from source file + with open(source_file_path, encoding='utf8') as source_file: + source_content = source_file.read() +- +- # Process includes just for includes_map collection +- # Don't write the result back yet +- self.process_includes( ++  ++ # Determine recipient path for includes_map ++ recipient_md_path = f'{self.src_dir}/{rel_path.as_posix()}' ++  ++ # Process includes for map collection ++ self.process_includes_for_map( + source_file_path, + source_content, +- self.project_path.resolve() ++ recipient_md_path + ) +  +- # Second pass: actually process files +- self.logger.debug('Second pass: processing includes') ++ # Second pass: process files in working directory ++ self.logger.debug('Second pass: processing includes in working directory') + for source_files_extension in source_files_extensions: + for source_file_path in self.working_dir.rglob(source_files_extension): + with open(source_file_path, encoding='utf8') as source_file: +@@ -1485,23 +1771,23 @@ class Preprocessor(BasePreprocessor): + with open(source_file_path, 'w', encoding='utf8') as processed_file: + processed_file.write(processed_content) +  +- # Write includes map ++ # Write includes map (sort data for consistent output) + if self.includes_map_enable: + output = f'{self.working_dir}/static/includes_map.json' + Path(f'{self.working_dir}/static/').mkdir(parents=True, exist_ok=True) ++  ++ # Sort includes_map for consistent output + def sort_includes_map(data): + if isinstance(data, list): +- # Sorting includes and anchors in each element + for item in data: + if isinstance(item, dict): + if 'includes' in item and isinstance(item['includes'], list): + item['includes'].sort() + if 'anchors' in item and isinstance(item['anchors'], list): + item['anchors'].sort() +- # Sorting the entire list by the 'file' field + data.sort(key=lambda x: x.get('file', '')) + return data +- ++  + sorted_includes_map = sort_includes_map(self.includes_map) +  + with open(output, 'w', encoding='utf8') as f: From f2ddc240482119258b8f77dbef86f1c03a8d57ec Mon Sep 17 00:00:00 2001 From: Timur Osmanov Date: Wed, 14 Jan 2026 15:15:10 +0300 Subject: [PATCH 7/8] remove: changes made for refactoring --- foliant/preprocessors/includes.py | 429 +++++++++++++++--------------- hell 3.9.19 | 378 -------------------------- 2 files changed, 214 insertions(+), 593 deletions(-) delete mode 100644 hell 3.9.19 diff --git a/foliant/preprocessors/includes.py b/foliant/preprocessors/includes.py index 1bf04cb..bb2620e 100644 --- a/foliant/preprocessors/includes.py +++ b/foliant/preprocessors/includes.py @@ -1,7 +1,6 @@ import re import urllib.request import urllib.error -import urllib.parse from shutil import rmtree from io import StringIO from hashlib import md5 @@ -55,7 +54,7 @@ def __init__(self, *args, **kwargs): self.includes_map_anchors = False if 'includes_map' in self.options: self.includes_map_enable = True - if isinstance(self.options['includes_map'], dict) and 'anchors' in self.options['includes_map']: + if type(self.options['includes_map']) != bool and 'anchors' in self.options['includes_map']: self.includes_map_anchors = True if self.includes_map_enable: @@ -63,15 +62,15 @@ def __init__(self, *args, **kwargs): self.enable_clean_tokens = True self.chapters = [] - self._chapters_list(self.config["chapters"], self.chapters) # converting chapters to a list + self.chapters_list(self.config["chapters"], self.chapters) # converting chapters to a list self.logger = self.logger.getChild('includes') self.logger.debug(f'Preprocessor inited: {self.__dict__}') - def _chapters_list(self, obj, chapters: list) -> None: + def chapters_list(self, obj, chapters: list) -> list: '''Converting chapters to a list - :param obj: Chapters from config + :param config_chapters: Chapters from config :param chapters: List of chapters ''' if isinstance(obj, list): @@ -79,15 +78,15 @@ def _chapters_list(self, obj, chapters: list) -> None: if isinstance(item, str): chapters.append(f"{self.src_dir}/{item}") else: - self._chapters_list(item, chapters) + self.chapters_list(item, chapters) elif isinstance(obj, Path): chapters.append(f"{self.src_dir}/{obj.as_posix()}") - elif isinstance(obj, dict): + elif isinstance(obj, object): for _, v in obj.items(): if isinstance(v, str): chapters.append(f"{self.src_dir}/{v}") else: - self._chapters_list(v, chapters) + self.chapters_list(v, chapters) def _find_file( self, @@ -97,7 +96,7 @@ def _find_file( '''Find a file in a directory by name. Check subdirectories recursively. :param file_name: Name of the file - :param lookup_dir: Starting directory + :lookup_dir: Starting directory :returns: Path to the found file or None if the file was not found :raises: FileNotFoundError @@ -112,21 +111,21 @@ def _find_file( result = item break - if result is None: - raise FileNotFoundError(f"File not found: {file_name}") + else: + raise FileNotFoundError(file_name) self.logger.debug(f'File found: {result}') return result - def create_full_link(self, repo_url: str, revision: str, path: str) -> str: - """Create full link to file in repository.""" + def create_full_link(self, repo_url: str, revision: str, path: str): if repo_url.endswith('.git'): repo_url = repo_url[:-4] if revision: full_repo_url = repo_url + '/tree/' + revision + '/' + path.rpartition('/')[0] + else: full_repo_url = repo_url + '/-/blob/master/' + path.rpartition('/')[0] @@ -154,8 +153,8 @@ def _download_file_from_url(self, url: str) -> Path: extra_suffix = '.inc' downloaded_file_path = ( - self._downloaded_dir_path / - f'{md5(url.encode()).hexdigest()[:8]}_{url_path.stem}{extra_stem}{url_path.suffix}{extra_suffix}' + self._downloaded_dir_path / + f'{md5(url.encode()).hexdigest()[:8]}_{url_path.stem}{extra_stem}{url_path.suffix}{extra_suffix}' ) self.logger.debug(f'Downloaded file path: {downloaded_file_path}') @@ -166,18 +165,13 @@ def _download_file_from_url(self, url: str) -> Path: response = urllib.request.urlopen(url, timeout=2) except (urllib.error.HTTPError, urllib.error.URLError) as error: self.logger.error(f'Data is not retrieved with {error}\nURL: {url}') - raise except socket.timeout: self.logger.error(f'socket timed out - URL {url}') - raise else: charset = 'utf-8' - if response.headers.get('Content-Type'): - charset_match = re.search( - r'(^|[\s;])charset=(?P[^\s;]+)', - response.headers['Content-Type'] - ) + if response.headers['Content-Type']: + charset_match = re.search(r'(^|[\s;])charset=(?P[^\s;]+)', response.headers['Content-Type']) if charset_match: charset = charset_match.group('charset') @@ -188,7 +182,7 @@ def _download_file_from_url(self, url: str) -> Path: self._downloaded_dir_path.mkdir(parents=True, exist_ok=True) - # Convert relative paths to absolute links in downloaded content + # The beginning of the block codes for converting relative paths to links dict_new_link = {} regexp_find_link = re.compile(r'\[.+?\]\(.+?\)') regexp_find_path = re.compile(r'\(.+?\)') @@ -202,12 +196,9 @@ def _download_file_from_url(self, url: str) -> Path: else: relative_path = regexp_find_path.findall(line) sub_relative_path = re.findall(r'\[.+?\]', line) - if relative_path and sub_relative_path: - dict_new_link[line] = ( - sub_relative_path[0] + '(' + - url.rpartition('/')[0].replace('raw', 'blob') + '/' + - relative_path[0].partition('(')[2] - ) + dict_new_link[line] = sub_relative_path[0] + '(' + url.rpartition('/')[0].replace('raw', + 'blob') + '/' + \ + relative_path[0].partition('(')[2] for line in dict_new_link: downloaded_content = downloaded_content.replace(line, dict_new_link[line]) @@ -239,18 +230,20 @@ def _sync_repo( self.logger.debug(f'Synchronizing with repo; URL: {repo_url}, revision: {revision}') try: - if not repo_path.exists(): - self.logger.debug(f'Cloning repo {repo_url} to {repo_path}') - - run( - f'git clone {repo_url} {repo_path}', - shell=True, - check=True, - stdout=PIPE, - stderr=STDOUT - ) - else: - self.logger.debug('Repo already exists; pulling from remote') + self.logger.debug(f'Cloning repo {repo_url} to {repo_path}') + + run( + f'git clone {repo_url} {repo_path}', + shell=True, + check=True, + stdout=PIPE, + stderr=STDOUT + ) + + except CalledProcessError as exception: + if repo_path.exists(): + self.logger.debug('Repo already cloned; pulling from remote') + try: run( 'git pull', @@ -260,25 +253,24 @@ def _sync_repo( stdout=PIPE, stderr=STDOUT ) + except CalledProcessError as exception: - self.logger.warning(f"Git pull failed: {exception}") + self.logger.warning(str(exception)) + except Exception as exception: + self.logger.warning(str(exception)) - except CalledProcessError as exception: - self.logger.error(f"Git operation failed: {exception}") - raise + else: + self.logger.error(str(exception)) if revision: - try: - run( - f'git checkout {revision}', - cwd=repo_path, - shell=True, - check=True, - stdout=PIPE, - stderr=STDOUT - ) - except CalledProcessError as exception: - self.logger.warning(f"Failed to checkout revision {revision}: {exception}") + run( + f'git checkout {revision}', + cwd=repo_path, + shell=True, + check=True, + stdout=PIPE, + stderr=STDOUT + ) return repo_path @@ -303,10 +295,12 @@ def _sub(heading): f'Shift heading level to {new_heading_level}, heading content: {heading.group("content")}' ) - if new_heading_level <= 6 and new_heading_level >= 1: + if new_heading_level <= 6: return f'{"#" * new_heading_level} {heading.group("content")}{heading.group("tail")}' + else: self.logger.debug('New heading level is out of range, using bold paragraph text instead of heading') + return f'**{heading.group("content")}**{heading.group("tail")}' return self._heading_pattern.sub(_sub, content) @@ -388,8 +382,6 @@ def _cut_from_position_to_position( ) # First, cut the content from the starting position to the end - from_heading_line = None - from_heading_level = None if from_id: self.logger.debug('Starting point is defined by its ID') @@ -400,44 +392,40 @@ def _cut_from_position_to_position( ) from_anchor_pattern = re.compile( - rf'(?:(?]*))?\>{re.escape(from_id)}<\/anchor\>', - flags=re.MULTILINE + rf'(?:(?]*))?\>{re.escape(from_id)}<\/anchor\>' ) - if from_identified_heading_pattern.search(content): + if from_identified_heading_pattern.findall(content): self.logger.debug('Starting heading with defined ID is found') - parts = from_identified_heading_pattern.split(content, maxsplit=1) - if len(parts) > 1: - result = parts[1] - from_heading_line = from_identified_heading_pattern.search(content).group(0) - from_heading_level = len(self._heading_pattern.match(from_heading_line).group('hashes')) - else: - result = '' - elif from_anchor_pattern.search(content): + + result = from_identified_heading_pattern.split(content)[1] + + from_heading_line = from_identified_heading_pattern.findall(content)[0] + from_heading_level = len(self._heading_pattern.match(from_heading_line).group('hashes')) + + self.logger.debug(f'Level of starting heading: {from_heading_level}') + + elif from_anchor_pattern.findall(content): self.logger.debug('Starting anchor with defined ID is found') - parts = from_anchor_pattern.split(content, maxsplit=1) - if len(parts) > 1: - result = parts[1] - previous_content = parts[0] - - # Find the last heading before the anchor - last_heading_match = None - for heading_match in self._heading_pattern.finditer(previous_content): - last_heading_match = heading_match - - if last_heading_match: - from_heading_level = len(last_heading_match.group('hashes')) - self.logger.debug(f'Level of previous heading: {from_heading_level}') - else: - from_heading_level = self._find_top_heading_level(result) - self.logger.debug(f'No previous heading found, top level from result: {from_heading_level}') - else: - result = '' + + result = from_anchor_pattern.split(content)[1] + + previous_content = from_anchor_pattern.split(content)[0] + + from_heading_line = None + from_heading_level = None + + for previous_heading_match in self._heading_pattern.finditer(previous_content): + from_heading_level = len(previous_heading_match.group('hashes')) + + self.logger.debug(f'Level of starting heading: {from_heading_level}') + else: self.logger.debug( 'Neither starting heading nor starting anchor is found, ' 'skipping the included content' ) + return '' elif from_heading: @@ -448,38 +436,46 @@ def _cut_from_position_to_position( flags=re.MULTILINE ) - if from_heading_pattern.search(content): + if from_heading_pattern.findall(content): self.logger.debug('Starting heading with defined content is found') - parts = from_heading_pattern.split(content, maxsplit=1) - if len(parts) > 1: - result = parts[1] - from_heading_line = from_heading_pattern.search(content).group(0) - from_heading_level = len(self._heading_pattern.match(from_heading_line).group('hashes')) - else: - result = '' + + result = from_heading_pattern.split(content)[1] + + from_heading_line = from_heading_pattern.findall(content)[0] + from_heading_level = len(self._heading_pattern.match(from_heading_line).group('hashes')) + + self.logger.debug(f'Level of starting heading: {from_heading_level}') + else: self.logger.debug('Starting heading is not found, skipping the included content') + return '' else: self.logger.debug('Starting point is not defined') content_buffer = StringIO(content) + first_line = content_buffer.readline() if self._heading_pattern.fullmatch(first_line): self.logger.debug('The content starts with heading') + result = content_buffer.read() from_heading_line = first_line from_heading_level = len(self._heading_pattern.match(from_heading_line).group('hashes')) + else: self.logger.debug('The content does not start with heading') + result = content + from_heading_line = None from_heading_level = self._find_top_heading_level(content) self.logger.debug(f'Topmost heading level: {from_heading_level}') # After that, cut the result to the ending position + if to_end: self.logger.debug('Ending point is defined as the end of the document') @@ -497,10 +493,14 @@ def _cut_from_position_to_position( if to_identified_heading_pattern.findall(result): self.logger.debug('Ending heading with defined ID is found') + result = to_identified_heading_pattern.split(result)[0] + elif to_anchor_pattern.findall(result): self.logger.debug('Ending anchor with defined ID is found') + result = to_anchor_pattern.split(result)[0] + else: self.logger.debug('Neither ending heading nor ending anchor is found, cutting to the end') @@ -512,10 +512,11 @@ def _cut_from_position_to_position( flags=re.MULTILINE ) - if to_heading_pattern.search(result): + if to_heading_pattern.findall(result): self.logger.debug('Ending heading with defined content is found') - parts = to_heading_pattern.split(result, maxsplit=1) - result = parts[0] if parts else '' + + result = to_heading_pattern.split(result)[0] + else: self.logger.debug('Ending heading is not found, cutting to the end') @@ -527,32 +528,38 @@ def _cut_from_position_to_position( 'Since starting point is defined, cutting to the next heading of the same level' ) - if from_heading_level: - to_heading_pattern = re.compile( - rf'^\#{{1,{from_heading_level}}}\s+\S+.*$', - flags=re.MULTILINE - ) - parts = to_heading_pattern.split(result, maxsplit=1) - result = parts[0] if parts else '' + to_heading_pattern = re.compile( + rf'^\#{{1,{from_heading_level}}}\s+\S+.*$', + flags=re.MULTILINE + ) + + result = to_heading_pattern.split(result)[0] + else: self.logger.debug( 'Since starting point is not defined, using the whole included content' ) # Finally, take into account the options nohead and sethead + if not nohead and from_heading_line: self.logger.debug( 'Since nohead option is not specified, and the included content starts with heading, ' + 'including starting heading into the output' ) + result = from_heading_line + result - if sethead and from_heading_level: + if sethead: if sethead > 0: self.logger.debug( 'Since sethead option is specified, shifting headings levels in the included content' ) - result = self._shift_headings(result, sethead - from_heading_level) + + result = self._shift_headings( + result, + sethead - from_heading_level + ) return result @@ -594,76 +601,63 @@ def _adjust_links( :param content: Markdown content :param markdown_file_path: Path to the Markdown file containing the content - :param origin_file_path: Path to the original file where the include tag is located :returns: Markdown content with relative internal link paths ''' - - def _resolve_link(link: str, root_path: Path, depth_origin: int) -> str: + def _resolve_link(link, root_path, depth_origin): try: resolved_link = (markdown_file_path.absolute().parent / Path(link)).resolve() resolved_link = resolved_link.relative_to(root_path) resolved_link = '../' * depth_origin + resolved_link.as_posix() return resolved_link except Exception as exception: - self.logger.debug(f'An error {exception} occurred when resolving the link: {link}') - return link + self.logger.debug( + f'An error {exception} occurred when resolving the link: {link}' + ) def _sub(m): caption = m.group('text') link = m.group('path') anchor = '' - - # Split link and anchor link_array = m.group('path').split('#') if len(link_array) > 1: link = link_array[0] anchor = f'#{link_array[1]}' - root_path = self.project_path.absolute() / self.tmp_dir - - # Skip absolute paths and external URLs - if Path(link).is_absolute() or link.startswith(('http://', 'https://', 'ftp://')): - return f'[{caption}]({link}{anchor})' - - extension = Path(link).suffix - - try: - origin_rel = origin_file_path.relative_to(root_path) - depth_origin = len(origin_rel.parts) - depth_markdown_file = len(markdown_file_path.relative_to(root_path).parts) - depth_difference = depth_origin - depth_markdown_file - - if extension == ".md": - link = _resolve_link(link, root_path, depth_origin - 1) - elif extension == "": - if depth_origin >= depth_markdown_file: - link = '../' * depth_difference + link - else: - link_split = link.split('/') - if link_split and link_split[0] == '..': - if link_split[-1] == '': - link_split = link_split[:-1] - link_split = link_split[1:] - link = f"{'/'.join(link_split)}.md" - link = _resolve_link(link, root_path, depth_origin) - - # Check if link points to the same file (without anchor) - if (depth_difference == 0 and - Path(Path(link).name).with_suffix('').as_posix() == - Path(origin_rel.name).with_suffix('').as_posix()): - link = '' - - self.logger.debug( - f'Updating link reference; user specified path: {m.group("path")}, ' + - f'resolved path: {link}' - ) - - except Exception as exception: - self.logger.debug( - f'An error {exception} occurred when resolving the link: {m.group("path")}' - ) - link = m.group('path') + if not Path(link).is_absolute(): + extension = Path(link).suffix + try: + origin_rel = origin_file_path.relative_to(root_path) + depth_origin = len(origin_rel.parts) + depth_markdown_file = len(markdown_file_path.relative_to(root_path).parts) + depth_difference = depth_origin - depth_markdown_file + if extension == ".md": + link = _resolve_link(link, root_path, depth_origin - 1) + elif extension == "": + if depth_origin >= depth_markdown_file: + link = '../' * depth_difference + link + else: + link_split = link.split('/') + if link_split[0] == '..': + if link_split[-1] == '': + link_split = link_split[:-1] + link_split = link_split[1:] + link = f"{'/'.join(link_split)}.md" + link = _resolve_link(link, root_path, depth_origin) + if ( + depth_difference == 0 + ) and ( + Path(Path(link).name).with_suffix('').as_posix() == Path(origin_rel.name).with_suffix('').as_posix() + ): + link = '' + self.logger.debug( + f'Updating link reference; user specified path: {m.group("path")}, ' + + f'absolute path: {link}' + ) + except Exception as exception: + self.logger.debug( + f'An error {exception} occurred when resolving the link: {m.group("path")}' + ) return f'[{caption}]({link}{anchor})' @@ -747,9 +741,9 @@ def _get_src_file_path( ) path_mapped_to_src_dir = ( - self.project_path.resolve() / - self.config['src_dir'] / - path_relative_to_working_dir + self.project_path.resolve() / + self.config['src_dir'] / + path_relative_to_working_dir ) self.logger.debug( @@ -777,14 +771,14 @@ def _get_included_file_path( self.logger.debug(f'Currently processed Markdown file: {current_processed_file_path}') - included_file_path = (current_processed_file_path.parent / Path(user_specified_path)).resolve() + included_file_path = (current_processed_file_path.parent / user_specified_path).resolve() self.logger.debug(f'User-specified included file path: {included_file_path}') if ( - self.working_dir.resolve() in current_processed_file_path.parents - and - self.working_dir.resolve() not in included_file_path.parents + self.working_dir.resolve() in current_processed_file_path.parents + and + self.working_dir.resolve() not in included_file_path.parents ): self.logger.debug( 'Currently processed file is located inside the working dir, ' + @@ -794,11 +788,13 @@ def _get_included_file_path( ) included_file_path = ( - self._get_src_file_path(current_processed_file_path).parent / Path(user_specified_path) + self._get_src_file_path(current_processed_file_path).parent / user_specified_path ).resolve() else: - self.logger.debug('Using these paths without changes') + self.logger.debug( + 'Using these paths without changes' + ) self.logger.debug(f'Finally, included file path: {included_file_path}') @@ -947,7 +943,7 @@ def _process_include( strip the top heading, set heading level. :param included_file_path: Path to the included file - :param project_root_path: Path to the "root" directory of Foliant project + :param project_root_path: Path to the “root” directory of Foliant project that the currently processed Markdown file belongs to :param from_heading: Include starting from this heading :param to_heading: Include up to this heading (not including the heading itself) @@ -974,17 +970,18 @@ def _process_include( if self.options['allow_failure']: self.logger.error(f'The url or repo_url link is not correct, file not found: {included_file_path}') - path_error_link = Path(self.project_path / '.error_link').resolve() + path_error_link = Path(self.project_path/'.error_link').resolve() + + if not Path(path_error_link).exists(): + Path(path_error_link).mkdir() - if not path_error_link.exists(): - path_error_link.mkdir(parents=True) + path_error_file = open(path_error_link/included_file_path.name, 'w+', encoding='utf8') - path_error_file = path_error_link / included_file_path.name - with open(path_error_file, 'w+', encoding='utf8') as f: - if self.options['stub_text']: - f.write(f'The url or repo_url link is not correct, file not found: {included_file_path}') + if self.options['stub_text']: + path_error_file.write(f'The url or repo_url link is not correct, file not found: {included_file_path}') + path_error_file.close() - included_file_path = path_error_file + included_file_path = path_error_link/included_file_path.name else: self.logger.error(f'The url or repo_url link is not correct, file not found: {included_file_path}') return '', anchors @@ -992,7 +989,7 @@ def _process_include( with open(included_file_path, encoding='utf8') as included_file: included_content = included_file.read() - # Convert relative paths to absolute links for URL includes + # The beginning of the block codes for converting relative paths to links if include_link: dict_new_link = {} regexp_find_link = re.compile(r'\[.+?\]\(.+?\)') @@ -1009,15 +1006,12 @@ def _process_include( continue else: sub_relative_path = re.findall(r'\[.+?\]', line) - if sub_relative_path and relative_path: - dict_new_link[line] = ( - sub_relative_path[0] + '(' + - include_link.rpartition('/')[0].replace('raw', 'blob') + '/' + - relative_path[0].partition('(')[2] - ) + dict_new_link[line] = sub_relative_path[0] + '(' + include_link.rpartition('/')[0].replace( + 'raw', 'blob') + '/' + relative_path[0].partition('(')[2] for line in dict_new_link: included_content = included_content.replace(line, dict_new_link[line]) + # End of the conversion code block # Removing metadata from content before including included_content = remove_meta(included_content) @@ -1114,7 +1108,7 @@ def _add_anchors(self, anchor_list: list, content: str) -> list: return anchor_list def clean_tokens(self, url: str) -> str: - """Remove authentication tokens from URLs.""" + """Remove tokens from URLs.""" token_pattern = r"(https*://)(.*)@(.*)" s = url if self.enable_clean_tokens: @@ -1132,7 +1126,10 @@ def _prepare_path_for_includes_map(self, path: Path) -> str: _path = path.relative_to(getcwd()) if _path.as_posix().startswith(self.working_dir.as_posix()): _path = _path.relative_to(self.working_dir) - donor_path = f"{self.src_dir}/{_path.as_posix()}" + if _path.as_posix().startswith(self.working_dir.as_posix()): + donor_path = f"{self.src_dir}/{_path.relative_to(self.working_dir).as_posix()}" + else: + donor_path = f"{self.src_dir}/{_path.as_posix()}" else: donor_path = _path.as_posix() return donor_path @@ -1305,7 +1302,7 @@ def process_includes( :param markdown_file_path: Path to currently processed Markdown file :param content: Markdown content - :param project_root_path: Path to the "root" directory of Foliant project + :param project_root_path: Path to the “root” directory of Foliant project that the currently processed Markdown file belongs to :param sethead: Level of the topmost heading in the content, it may be set when the method is called recursively @@ -1336,8 +1333,9 @@ def process_includes( include_statement = self.pattern.fullmatch(content_part) if include_statement: - donor_md_path = None - donor_anchors = [] + if self.includes_map_enable: + donor_md_path = None + donor_anchors = [] current_project_root_path = project_root_path @@ -1359,6 +1357,7 @@ def process_includes( if options.get('sethead'): if current_sethead: current_sethead += options['sethead'] - 1 + else: current_sethead = options['sethead'] @@ -1386,7 +1385,7 @@ def process_includes( > """ - if body and body.group('path'): + if body: self.logger.debug('Using the legacy syntax rules') if body.group('repo'): @@ -1401,6 +1400,7 @@ def process_includes( if '#' in repo_from_alias: repo_url, revision = repo_from_alias.split('#', maxsplit=1) + else: repo_url = repo_from_alias @@ -1409,6 +1409,7 @@ def process_includes( if body.group('revision'): revision = body.group('revision') + self.logger.debug( f'Highest priority revision specified in the include statement: {revision}' ) @@ -1416,16 +1417,15 @@ def process_includes( self.logger.debug(f'Repo URL: {repo_url}, revision: {revision}') repo_path = self._sync_repo(repo_url, revision) + self.logger.debug(f'Local path of the repo: {repo_path}') included_file_path = repo_path / body.group('path') if self.includes_map_enable: - include_link = self.create_full_link(repo_url, revision, body.group('path')) - donor_md_path = include_link + body.group('path') + donor_md_path = included_file_path.as_posix() donor_md_path = self.clean_tokens(donor_md_path) self.logger.debug(f'Set the repo URL of the included file to {recipient_md_path}: {donor_md_path} (1)') - if included_file_path.name.startswith('^'): included_file_path = self._find_file( included_file_path.name[1:], included_file_path.parent @@ -1434,7 +1434,7 @@ def process_includes( self.logger.debug(f'Resolved path to the included file: {included_file_path}') current_project_root_path = ( - repo_path / options.get('project_root', '') + repo_path / options.get('project_root', '') ).resolve() self.logger.debug(f'Set new current project root path: {current_project_root_path}') @@ -1466,7 +1466,7 @@ def process_includes( if options.get('project_root'): current_project_root_path = ( - markdown_file_path.parent / options.get('project_root') + markdown_file_path.parent / options.get('project_root') ).resolve() self.logger.debug(f'Set new current project root path: {current_project_root_path}') @@ -1489,27 +1489,26 @@ def process_includes( if self.includes_map_enable and self.includes_map_anchors: donor_anchors = donor_anchors + anchors - else: # if body is missing or empty + else: # if body is missing self.logger.debug('Using the new syntax rules') if options.get('repo_url') and options.get('path'): self.logger.debug('File in Git repository referenced') repo_path = self._sync_repo(options.get('repo_url'), options.get('revision')) + self.logger.debug(f'Local path of the repo: {repo_path}') included_file_path = repo_path / options['path'] + self.logger.debug(f'Resolved path to the included file: {included_file_path}') current_project_root_path = ( - repo_path / options.get('project_root', '') + repo_path / options.get('project_root', '') ).resolve() - include_link = self.create_full_link( - options.get('repo_url'), - options.get('revision'), - options.get('path') - ) + include_link = self.create_full_link(options.get('repo_url'), options.get('revision'), + options.get('path')) self.logger.debug(f'Set new current project root path: {current_project_root_path}') @@ -1539,11 +1538,12 @@ def process_includes( self.logger.debug('File to get by URL referenced') included_file_path = self._download_file_from_url(options['url']) + self.logger.debug(f'Resolved path to the included file: {included_file_path}') if options.get('project_root'): current_project_root_path = ( - markdown_file_path.parent / options.get('project_root') + markdown_file_path.parent / options.get('project_root') ).resolve() self.logger.debug(f'Set new current project root path: {current_project_root_path}') @@ -1581,7 +1581,7 @@ def process_includes( if options.get('project_root'): current_project_root_path = ( - markdown_file_path.parent / options.get('project_root') + markdown_file_path.parent / options.get('project_root') ).resolve() self.logger.debug(f'Set new current project root path: {current_project_root_path}') @@ -1610,6 +1610,7 @@ def process_includes( self.logger.warning( 'Neither repo_url+path nor src specified, ignoring the include statement' ) + processed_content_part = '' if self.options['recursive'] and self.pattern.search(processed_content_part): @@ -1628,15 +1629,19 @@ def process_includes( wrapper = '' if wrap_code == 'triple_backticks': self.logger.debug('Wrapping included content as fence code block with triple backticks') + wrapper = '```' + elif wrap_code == 'triple_tildas': self.logger.debug('Wrapping included content as fence code block with triple tildas') + wrapper = '~~~' code_language = options.get('code_language', '') if code_language: self.logger.debug(f'Specifying code language: {code_language}') + else: self.logger.debug('Do not specify code language') @@ -1644,22 +1649,23 @@ def process_includes( processed_content_part += '\n' processed_content_part = ( - f'{wrapper}{code_language}\n{processed_content_part}{wrapper}\n' + f'{wrapper}{code_language}' + '\n' + processed_content_part + wrapper + '\n' ) elif wrap_code == 'single_backticks': self.logger.debug('Wrapping included content as inline code with single backticks') + processed_content_part = '`' + processed_content_part + '`' if options.get('inline'): self.logger.debug( 'Processing included content part as inline, multiple lines will be stretched into one' ) + processed_content_part = re.sub(r'\s+', ' ', processed_content_part).strip() if self.includes_map_enable: if donor_md_path: - # Only add to includes_map if the recipient file is in chapters list if recipient_md_path in self.chapters or "index.md" in recipient_md_path: if not self._exist_in_includes_map(self.includes_map, recipient_md_path): if not self.includes_map_anchors or len(donor_anchors) == 0: @@ -1669,8 +1675,7 @@ def process_includes( for i, f in enumerate(self.includes_map): if f['file'] == recipient_md_path: - if donor_md_path not in self.includes_map[i]['includes']: - self.includes_map[i]['includes'].append(donor_md_path) + self.includes_map[i]['includes'].append(donor_md_path) if self.includes_map_anchors: if 'anchors' not in self.includes_map[i]: @@ -1678,8 +1683,6 @@ def process_includes( for anchor in donor_anchors: if anchor not in self.includes_map[i]['anchors']: self.includes_map[i]['anchors'].append(anchor) - else: - self.logger.debug(f'File {recipient_md_path} is not in chapters, skipping includes_map') else: processed_content_part = content_part @@ -1710,13 +1713,12 @@ def _get_source_files_extensions(self) -> list: if not md_involved: self.logger.warning( "Markdown file extension 'md' is not mentioned in the extensions list! " + - "Didn't you forget to put it there?" + "Didn’t you forget to put it there?" ) return source_files_extensions def apply(self): - """Apply the preprocessor to all source files.""" self.logger.info('Applying preprocessor') @@ -1737,9 +1739,6 @@ def apply(self): # Get relative path from src_dir rel_path = source_file_path.relative_to(src_dir_path) - # Check if this file is in the working directory (copied) - working_file_path = self.working_dir / rel_path - # Read content from source file with open(source_file_path, encoding='utf8') as source_file: source_content = source_file.read() diff --git a/hell 3.9.19 b/hell 3.9.19 deleted file mode 100644 index a55457f..0000000 --- a/hell 3.9.19 +++ /dev/null @@ -1,378 +0,0 @@ -diff --git a/foliant/preprocessors/includes.py b/foliant/preprocessors/includes.py -index 9277324..26d6bc4 100644 ---- a/foliant/preprocessors/includes.py -+++ b/foliant/preprocessors/includes.py -@@ -804,6 +804,130 @@ class Preprocessor(BasePreprocessor): -  - return included_file_path -  -+ def _read_source_file_content( -+ self, -+ file_path: Path -+ ) -> str: -+ '''Read content from source file, handling both temporary and source directory paths. -+  -+ :param file_path: Path to the file to read -+  -+ :returns: File content as string -+ ''' -+  -+ self.logger.debug(f'Reading source file: {file_path}') -+  -+ # If the file is located in a temporary directory, let's try to find the corresponding source file -+ if self.working_dir.resolve() in file_path.parents: -+ # This is a file in a temporary directory -+ try: -+ # Get the path to the source file -+ src_file_path = self._get_src_file_path(file_path) -+ self.logger.debug(f'Mapping temporary file to source file: {src_file_path}') -+  -+ if src_file_path.exists(): -+ with open(src_file_path, encoding='utf8') as src_file: -+ return src_file.read() -+ else: -+ # If the source file is not found, we read from the temporary file -+ self.logger.debug('Source file not found, reading from temporary file') -+ if file_path.exists(): -+ with open(file_path, encoding='utf8') as temp_file: -+ return temp_file.read() -+ else: -+ self.logger.warning(f'File not found: {file_path}') -+ return '' -+ except Exception as e: -+ self.logger.debug(f'Error mapping to source file: {e}, reading from temporary file') -+ if file_path.exists(): -+ with open(file_path, encoding='utf8') as temp_file: -+ return temp_file.read() -+ else: -+ self.logger.warning(f'File not found: {file_path}') -+ return '' -+ else: -+ # The file is not in the temporary directory, we read it directly -+ if file_path.exists(): -+ with open(file_path, encoding='utf8') as src_file: -+ return src_file.read() -+ else: -+ self.logger.warning(f'File not found: {file_path}') -+ return '' -+ -+ def _has_not_build_meta(self, content: str) -> bool: -+ '''Check if content has not_build: true in front matter. -+  -+ :param content: File content -+  -+ :returns: True if file has not_build: true in metadata -+ ''' -+ # Simple check for front matter with not_build: true -+ front_matter_pattern = re.compile(r'^---\s*\n(.*?)\n---\s*\n', re.DOTALL | re.MULTILINE) -+ match = front_matter_pattern.match(content) -+  -+ if match: -+ front_matter = match.group(1) -+ # Check for not_build: true -+ not_build_pattern = re.compile(r'not_build\s*:\s*true', re.IGNORECASE) -+ return bool(not_build_pattern.search(front_matter)) -+  -+ return False -+ -+ def _process_include_for_includes_map( -+ self, -+ included_file_path: Path, -+ from_heading: str or None = None, -+ to_heading: str or None = None, -+ from_id: str or None = None, -+ to_id: str or None = None, -+ to_end: bool = False -+ ) -> (str, list): -+ '''Process include statement specifically for includes_map generation. -+ Reads content from source files directly, not from temporary directory. -+  -+ :param included_file_path: Path to the included file -+ :param from_heading: Include starting from this heading -+ :param to_heading: Include up to this heading -+ :param from_id: Include starting from the heading or the anchor that has this ID -+ :param to_id: Include up to the heading or the anchor that has this ID -+ :param to_end: Flag that tells to cut to the end of document -+  -+ :returns: Tuple of (included file content, list of anchors) -+ ''' -+  -+ self.logger.debug(f'Processing include for includes_map: {included_file_path}') -+  -+ anchors = [] -+  -+ # Reading the contents of the file from the source directory -+ content = self._read_source_file_content(included_file_path) -+  -+ if not content: -+ return '', anchors -+  -+ # Check if the file has not_build: true -+ if self._has_not_build_meta(content): -+ self.logger.debug(f'File {included_file_path} has not_build: true, using original content') -+  -+ # Removing metadata from content -+ content = remove_meta(content) -+  -+ # Cut content based on parameters -+ content = self._cut_from_position_to_position( -+ content, -+ from_heading, -+ to_heading, -+ from_id, -+ to_id, -+ to_end -+ ) -+  -+ # Find anchors -+ if self.includes_map_anchors: -+ anchors = self._add_anchors(anchors, content) -+  -+ return content, anchors -+ - def _process_include( - self, - included_file_path: Path, -@@ -1020,6 +1144,156 @@ class Preprocessor(BasePreprocessor): - return True - return False -  -+ def process_includes_for_map( -+ self, -+ markdown_file_path: Path, -+ content: str, -+ recipient_md_path: str -+ ) -> None: -+ '''Process includes specifically for includes_map generation. -+ This method only collects includes information without modifying content. -+  -+ :param markdown_file_path: Path to currently processed Markdown file -+ :param content: Markdown content -+ :param recipient_md_path: Path to the file in source directory -+ ''' -+  -+ self.logger.debug(f'Processing includes for map: {markdown_file_path}') -+  -+ include_statement_pattern = re.compile( -+ rf'((?]*)?\>.*?\<\/(?:{"|".join(self.tags)})\>)', -+ flags=re.DOTALL -+ ) -+ -+ content_parts = include_statement_pattern.split(content) -+ -+ for content_part in content_parts: -+ include_statement = self.pattern.fullmatch(content_part) -+ -+ if include_statement: -+ donor_md_path = None -+ donor_anchors = [] -+ -+ body = self._tag_body_pattern.match(include_statement.group('body').strip()) -+ options = self.get_options(include_statement.group('options')) -+ -+ if body and body.group('path'): -+ if body.group('repo'): -+ # File in Git repository -+ repo_from_alias = self.options['aliases'].get(body.group('repo')) -+ -+ revision = None -+ -+ if repo_from_alias: -+ if '#' in repo_from_alias: -+ repo_url, revision = repo_from_alias.split('#', maxsplit=1) -+ else: -+ repo_url = repo_from_alias -+ else: -+ repo_url = body.group('repo') -+ -+ if body.group('revision'): -+ revision = body.group('revision') -+ -+ # Create link to repository file -+ include_link = self.create_full_link(repo_url, revision, body.group('path')) -+ donor_md_path = include_link + body.group('path') -+ donor_md_path = self.clean_tokens(donor_md_path) -+ -+ # Process include for anchors -+ _, anchors = self._process_include_for_includes_map( -+ included_file_path=Path('/dummy/path'), # dummy path for repo files -+ from_heading=body.group('from_heading'), -+ to_heading=body.group('to_heading') -+ ) -+  -+ if self.includes_map_anchors: -+ donor_anchors = donor_anchors + anchors -+ -+ else: -+ # Local file -+ included_file_path = self._get_included_file_path(body.group('path'), markdown_file_path) -+ donor_md_path = self._prepare_path_for_includes_map(included_file_path) -+ donor_md_path = self.clean_tokens(donor_md_path) -+ -+ # Process include for anchors (reading from source file) -+ _, anchors = self._process_include_for_includes_map( -+ included_file_path=included_file_path, -+ from_heading=body.group('from_heading'), -+ to_heading=body.group('to_heading') -+ ) -+  -+ if self.includes_map_anchors: -+ donor_anchors = donor_anchors + anchors -+ -+ else: # if body is missing or empty -+ if options.get('repo_url') and options.get('path'): -+ # File in Git repository -+ include_link = self.create_full_link( -+ options.get('repo_url'),  -+ options.get('revision'), -+ options.get('path') -+ ) -+ donor_md_path = include_link + options.get('path') -+ donor_md_path = self.clean_tokens(donor_md_path) -+ -+ # Process include for anchors -+ _, anchors = self._process_include_for_includes_map( -+ included_file_path=Path('/dummy/path'), # dummy path for repo files -+ from_heading=options.get('from_heading'), -+ to_heading=options.get('to_heading'), -+ from_id=options.get('from_id'), -+ to_id=options.get('to_id'), -+ to_end=options.get('to_end') -+ ) -+  -+ if self.includes_map_anchors: -+ donor_anchors = donor_anchors + anchors -+ -+ elif options.get('url'): -+ # File from URL -+ donor_md_path = options['url'] -+ donor_md_path = self.clean_tokens(donor_md_path) -+ -+ elif options.get('src'): -+ # Local file -+ included_file_path = self._get_included_file_path(options.get('src'), markdown_file_path) -+ donor_md_path = self._prepare_path_for_includes_map(included_file_path) -+ donor_md_path = self.clean_tokens(donor_md_path) -+ -+ # Process include for anchors (reading from source file) -+ _, anchors = self._process_include_for_includes_map( -+ included_file_path=included_file_path, -+ from_heading=options.get('from_heading'), -+ to_heading=options.get('to_heading'), -+ from_id=options.get('from_id'), -+ to_id=options.get('to_id'), -+ to_end=options.get('to_end') -+ ) -+  -+ if self.includes_map_anchors: -+ donor_anchors = donor_anchors + anchors -+ -+ # Add to includes_map -+ if donor_md_path and (recipient_md_path in self.chapters or "index.md" in recipient_md_path): -+ if not self._exist_in_includes_map(self.includes_map, recipient_md_path): -+ if not self.includes_map_anchors or len(donor_anchors) == 0: -+ self.includes_map.append({'file': recipient_md_path, "includes": []}) -+ else: -+ self.includes_map.append({'file': recipient_md_path, "includes": [], 'anchors': []}) -+ -+ for i, f in enumerate(self.includes_map): -+ if f['file'] == recipient_md_path: -+ if donor_md_path not in self.includes_map[i]['includes']: -+ self.includes_map[i]['includes'].append(donor_md_path) -+ -+ if self.includes_map_anchors: -+ if 'anchors' not in self.includes_map[i]: -+ self.includes_map[i]['anchors'] = [] -+ for anchor in donor_anchors: -+ if anchor not in self.includes_map[i]['anchors']: -+ self.includes_map[i]['anchors'].append(anchor) -+ - def process_includes( - self, - markdown_file_path: Path, -@@ -1147,7 +1421,8 @@ class Preprocessor(BasePreprocessor): - included_file_path = repo_path / body.group('path') -  - if self.includes_map_enable: -- donor_md_path = included_file_path.as_posix() -+ include_link = self.create_full_link(repo_url, revision, body.group('path')) -+ donor_md_path = include_link + body.group('path') - donor_md_path = self.clean_tokens(donor_md_path) - self.logger.debug(f'Set the repo URL of the included file to {recipient_md_path}: {donor_md_path} (1)') -  -@@ -1451,25 +1726,36 @@ class Preprocessor(BasePreprocessor): -  - source_files_extensions = self._get_source_files_extensions() -  -- # First pass: collect includes_map for all files (even not_build ones) -+ # First pass: collect includes_map for all files from source directory - if self.includes_map_enable: -- self.logger.debug('First pass: collecting includes_map') -- # We need to process all files to build includes_map -+ self.logger.debug('First pass: collecting includes_map from source files') -+  -+ # Process source directory files for includes_map -+ src_dir_path = self.project_path / self.src_dir - for source_files_extension in source_files_extensions: -- for source_file_path in self.working_dir.rglob(source_files_extension): -+ for source_file_path in src_dir_path.rglob(source_files_extension): -+ # Get relative path from src_dir -+ rel_path = source_file_path.relative_to(src_dir_path) -+  -+ # Check if this file is in the working directory (copied) -+ working_file_path = self.working_dir / rel_path -+  -+ # Read content from source file - with open(source_file_path, encoding='utf8') as source_file: - source_content = source_file.read() -- -- # Process includes just for includes_map collection -- # Don't write the result back yet -- self.process_includes( -+  -+ # Determine recipient path for includes_map -+ recipient_md_path = f'{self.src_dir}/{rel_path.as_posix()}' -+  -+ # Process includes for map collection -+ self.process_includes_for_map( - source_file_path, - source_content, -- self.project_path.resolve() -+ recipient_md_path - ) -  -- # Second pass: actually process files -- self.logger.debug('Second pass: processing includes') -+ # Second pass: process files in working directory -+ self.logger.debug('Second pass: processing includes in working directory') - for source_files_extension in source_files_extensions: - for source_file_path in self.working_dir.rglob(source_files_extension): - with open(source_file_path, encoding='utf8') as source_file: -@@ -1485,23 +1771,23 @@ class Preprocessor(BasePreprocessor): - with open(source_file_path, 'w', encoding='utf8') as processed_file: - processed_file.write(processed_content) -  -- # Write includes map -+ # Write includes map (sort data for consistent output) - if self.includes_map_enable: - output = f'{self.working_dir}/static/includes_map.json' - Path(f'{self.working_dir}/static/').mkdir(parents=True, exist_ok=True) -+  -+ # Sort includes_map for consistent output - def sort_includes_map(data): - if isinstance(data, list): -- # Sorting includes and anchors in each element - for item in data: - if isinstance(item, dict): - if 'includes' in item and isinstance(item['includes'], list): - item['includes'].sort() - if 'anchors' in item and isinstance(item['anchors'], list): - item['anchors'].sort() -- # Sorting the entire list by the 'file' field - data.sort(key=lambda x: x.get('file', '')) - return data -- -+  - sorted_includes_map = sort_includes_map(self.includes_map) -  - with open(output, 'w', encoding='utf8') as f: From 9152dec61c75144456723232eab0b10cc60d5bff Mon Sep 17 00:00:00 2001 From: Timur Osmanov Date: Wed, 14 Jan 2026 15:41:00 +0300 Subject: [PATCH 8/8] bump: version --- changelog.md | 4 ++++ setup.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/changelog.md b/changelog.md index ad16bd1..d12aaf0 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,7 @@ +# 1.1.21 + +- Fix: the creation of an includes map when using the `only_partial` argument. + # 1.1.20 - Fix: bug where the `_adjust_links` duplicated the anchor. diff --git a/setup.py b/setup.py index 045dd20..c757cbe 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ description=SHORT_DESCRIPTION, long_description=LONG_DESCRIPTION, long_description_content_type='text/markdown', - version='1.1.20', + version='1.1.21', author='Konstantin Molchanov', author_email='moigagoo@live.com', url='https://github.com/foliant-docs/foliantcontrib.includes',