[Confluence] Fix pagination for get_all_* methods and unify _get_paged across Cloud/Server

Zircoz · claude · Zircoz · commit ea2fa33449b4 · 2026-02-16T00:06:07.000+05:30
Fixes #1598

- Switch 10 get_all_* methods to use _get_paged for full pagination
- Unify _get_paged into ConfluenceBase (remove Cloud/Server duplicates)
- Handle _links.next as both string and dict formats
- Fix relative pagination URLs by prepending base URL correctly
- Fix Cloud api_root from wiki/api/v2 to wiki/rest/api (endpoints use v1 paths)
- Recognize api.atlassian.com in Cloud detection; support explicit cloud= kwarg
- Add routing tests and pagination edge-case tests for both Cloud and Server

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/atlassian/confluence/__init__.py b/atlassian/confluence/__init__.py
@@ -18,7 +18,11 @@ class Confluence(ConfluenceBase):
 
     def __init__(self, url, *args, **kwargs):
         # Detect which implementation to use
-        if ("atlassian.net" in url or "jira.com" in url) and ("/wiki" not in url):
+        # Priority: explicit cloud= kwarg > URL-based heuristic
+        is_cloud = kwargs.get("cloud")
+        if is_cloud is None:
+            is_cloud = "atlassian.net" in url or "jira.com" in url or "api.atlassian.com" in url
+        if is_cloud:
             impl = ConfluenceCloud(url, *args, **kwargs)
         else:
             impl = ConfluenceServer(url, *args, **kwargs)
diff --git a/atlassian/confluence/base.py b/atlassian/confluence/base.py
@@ -134,27 +134,39 @@ def _get_paged(
 
             yield from response.get("results", [])
 
-            if self.cloud:
-                url = response.get("_links", {}).get("next", {}).get("href")
-                if url is None:
-                    break
-                # From now on we have absolute URLs with parameters
-                absolute = True
-                # Params are now provided by the url
-                params = {}
-                # Trailing should not be added as it is already part of the url
-                trailing = False
+            next_link = response.get("_links", {}).get("next")
+            if next_link is None:
+                break
+            if isinstance(next_link, str):
+                url = next_link
             else:
-                if response.get("_links", {}).get("next") is None:
-                    break
-                # For server, we need to extract the next page URL from the _links.next.href
-                next_url = response.get("_links", {}).get("next", {}).get("href")
-                if next_url is None:
-                    break
-                url = next_url
-                absolute = True
-                params = {}
-                trailing = False
+                url = next_link.get("href")
+            if url is None:
+                break
+
+            if url.startswith("/"):
+                # Prepend base URL from self.url, stripping the API root suffix to preserve path prefix
+                # Example: self.url = "https://api.atlassian.com/ex/confluence/abc/wiki/rest/api"
+                #          api_root = "wiki/rest/api"
+                #          base = "https://api.atlassian.com/ex/confluence/abc"
+                #          relative = "/rest/api/content?cursor=1"
+                #          result = "https://api.atlassian.com/ex/confluence/abc/rest/api/content?cursor=1"
+                api_root_suffix = f"/{self.api_root}"
+                if self.url.endswith(api_root_suffix):
+                    base = self.url[:-len(api_root_suffix)]
+                else:
+                    # Fallback: extract scheme+netloc if api_root suffix not found
+                    from urllib.parse import urlparse
+                    parsed = urlparse(self.url)
+                    base = f"{parsed.scheme}://{parsed.netloc}"
+                url = base + url
+
+            # From now on we have absolute URLs with parameters
+            absolute = True
+            # Params are now provided by the url
+            params = {}
+            # Trailing should not be added as it is already part of the url
+            trailing = False
 
         return
 
diff --git a/atlassian/confluence/cloud/__init__.py b/atlassian/confluence/cloud/__init__.py
@@ -13,10 +13,10 @@ def __init__(self, url="https://api.atlassian.com/", *args, **kwargs):
         if "cloud" not in kwargs:
             kwargs["cloud"] = True
         if "api_version" not in kwargs:
-            kwargs["api_version"] = "2"
+            kwargs["api_version"] = "latest"
         if "api_root" not in kwargs:
-            kwargs["api_root"] = "wiki/api/v2"
-        url = url.strip("/")
+            kwargs["api_root"] = "wiki/rest/api"
+        url = url.strip("/") + f"/{kwargs['api_root']}"
         super(Cloud, self).__init__(url, *args, **kwargs)
 
     # Content Management
@@ -28,6 +28,14 @@ def get_content_by_type(self, content_type, **kwargs):
         """Get content by type (page, blogpost, etc.)."""
         return self.get("content", params={"type": content_type, **kwargs})
 
+    def get_all_pages_from_space(self, space_key, **kwargs):
+        """Get all pages from space."""
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "page", **kwargs})
+
+    def get_all_blog_posts_from_space(self, space_key, **kwargs):
+        """Get all blog posts from space."""
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "blogpost", **kwargs})
+
     def create_content(self, data, **kwargs):
         """Create new content."""
         return self.post("content", data=data, **kwargs)
diff --git a/atlassian/confluence/cloud/base.py b/atlassian/confluence/cloud/base.py
@@ -24,53 +24,4 @@ def __init__(self, url, *args, **kwargs):
         """
         super(ConfluenceCloudBase, self).__init__(url, *args, **kwargs)
 
-    def _get_paged(
-        self,
-        url,
-        params=None,
-        data=None,
-        flags=None,
-        trailing=None,
-        absolute=False,
-    ):
-        """
-        Used to get the paged data for Confluence Cloud
-
-        :param url: string:                        The url to retrieve
-        :param params: dict (default is None):     The parameter's
-        :param data: dict (default is None):       The data
-        :param flags: string[] (default is None):  The flags
-        :param trailing: bool (default is None):   If True, a trailing slash is added to the url
-        :param absolute: bool (default is False):  If True, the url is used absolute and not relative to the root
-
-        :return: A generator object for the data elements
-        """
-        if params is None:
-            params = {}
-
-        while True:
-            response = self.get(
-                url,
-                trailing=trailing,
-                params=params,
-                data=data,
-                flags=flags,
-                absolute=absolute,
-            )
-            if "results" not in response:
-                return
-
-            yield from response.get("results", [])
-
-            # Confluence Cloud uses _links.next.href for pagination
-            url = response.get("_links", {}).get("next", {}).get("href")
-            if url is None:
-                break
-            # From now on we have absolute URLs with parameters
-            absolute = True
-            # Params are now provided by the url
-            params = {}
-            # Trailing should not be added as it is already part of the url
-            trailing = False
 
-        return
diff --git a/atlassian/confluence/server/__init__.py b/atlassian/confluence/server/__init__.py
@@ -62,11 +62,11 @@ def get_content_by_id(self, content_id, **kwargs):
 
     def get_all_pages_from_space(self, space_key, **kwargs):
         """Get all pages from space."""
-        return self.get("content", params={"spaceKey": space_key, "type": "page", **kwargs})
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "page", **kwargs})
 
     def get_all_blog_posts_from_space(self, space_key, **kwargs):
         """Get all blog posts from space."""
-        return self.get("content", params={"spaceKey": space_key, "type": "blogpost", **kwargs})
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "blogpost", **kwargs})
 
     def get_page_by_title(self, space_key, title, **kwargs):
         """Get page by title and space key."""
@@ -195,11 +195,11 @@ def remove_content_label(self, content_id, label_name, **kwargs):
 
     def get_all_pages_by_label(self, label, **kwargs):
         """Get all pages by label."""
-        return self.get("content", params={"label": label, "type": "page", **kwargs})
+        return self._get_paged("content", params={"label": label, "type": "page", **kwargs})
 
     def get_all_blog_posts_by_label(self, label, **kwargs):
         """Get all blog posts by label."""
-        return self.get("content", params={"label": label, "type": "blogpost", **kwargs})
+        return self._get_paged("content", params={"label": label, "type": "blogpost", **kwargs})
 
     # Attachment Management
     def get_attachments(self, content_id, **kwargs):
@@ -293,24 +293,24 @@ def get_draft_content(self, content_id, **kwargs):
 
     def get_all_draft_pages_from_space(self, space_key, **kwargs):
         """Get all draft pages from space."""
-        return self.get("content", params={"spaceKey": space_key, "type": "page", "status": "draft", **kwargs})
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "page", "status": "draft", **kwargs})
 
     def get_all_draft_blog_posts_from_space(self, space_key, **kwargs):
         """Get all draft blog posts from space."""
-        return self.get("content", params={"spaceKey": space_key, "type": "blogpost", "status": "draft", **kwargs})
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "blogpost", "status": "draft", **kwargs})
 
     # Trash Management
     def get_trash_content(self, space_key, **kwargs):
         """Get trash content."""
-        return self.get("content", params={"spaceKey": space_key, "status": "trashed", **kwargs})
+        return self._get_paged("content", params={"spaceKey": space_key, "status": "trashed", **kwargs})
 
     def get_all_pages_from_space_trash(self, space_key, **kwargs):
         """Get all pages from space trash."""
-        return self.get("content", params={"spaceKey": space_key, "type": "page", "status": "trashed", **kwargs})
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "page", "status": "trashed", **kwargs})
 
     def get_all_blog_posts_from_space_trash(self, space_key, **kwargs):
         """Get all blog posts from space trash."""
-        return self.get("content", params={"spaceKey": space_key, "type": "blogpost", "status": "trashed", **kwargs})
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "blogpost", "status": "trashed", **kwargs})
 
     # Export
     def export_content(self, content_id, **kwargs):
diff --git a/atlassian/confluence/server/base.py b/atlassian/confluence/server/base.py
@@ -24,54 +24,4 @@ def __init__(self, url, *args, **kwargs):
         """
         super(ConfluenceServerBase, self).__init__(url, *args, **kwargs)
 
-    def _get_paged(
-        self,
-        url,
-        params=None,
-        data=None,
-        flags=None,
-        trailing=False,
-        absolute=False,
-    ):
-        """
-        Used to get the paged data for Confluence Server
-
-        :param url: string:                        The url to retrieve
-        :param params: dict (default is None):     The parameter's
-        :param data: dict (default is None):       The data
-        :param flags: string[] (default is None):  The flags
-        :param trailing: bool (default is None):   If True, a trailing slash is added to the url
-        :param absolute: bool (default is False):  If True, the url is used absolute and not relative to the root
-
-        :return: A generator object for the data elements
-        """
-        if params is None:
-            params = {}
-
-        while True:
-            response = self.get(
-                url,
-                trailing=trailing,
-                params=params,
-                data=data,
-                flags=flags,
-                absolute=absolute,
-            )
-            if "results" not in response:
-                return
-
-            yield from response.get("results", [])
-
-            # Confluence Server uses _links.next.href for pagination
-            if response.get("_links", {}).get("next") is None:
-                break
-            # For server, we need to extract the next page URL from the _links.next.href
-            next_url = response.get("_links", {}).get("next", {}).get("href")
-            if next_url is None:
-                break
-            url = next_url
-            absolute = True
-            params = {}
-            trailing = False
 
-        return
diff --git a/tests/confluence/test_confluence_cloud.py b/tests/confluence/test_confluence_cloud.py
diff --git a/tests/confluence/test_confluence_routing.py b/tests/confluence/test_confluence_routing.py
diff --git a/tests/confluence/test_confluence_server.py b/tests/confluence/test_confluence_server.py