First working commit for CLI

thegreenwebfoundation · Oct 23, 2024 · 8710802 · 8710802
1 parent c1dcd93
commit 8710802
Show file tree

Hide file tree

Showing 16 changed files with 920 additions and 58 deletions.
diff --git a/README.md b/README.md
@@ -1,30 +1,31 @@
+# Carbon.txt validator
+
+This validator reads carbon.txt files, and validates them against a spec defined on http://carbontxt.org.
+
 
 
 
 # Usage
 
+## With the CLI
 
-Run a validation against a given domain, and see if a) there is a CSRD report, and b) that the datapoints for a greenweb verification exist
+Run a validation against a given domain or file, and report whether the file is valid TOML and conforms to the carbon.txt spec
 
-```
-carbontxt validate --greenweb-csrd domain.com
-```
+```shell
+# parse the carbon.txt file on default paths on some-domain.com
+carbontxt validate domain some-domain.com
 
-Run a validation against a given domain, and only say if the file is valid TOML, and it confirms to the carbon.txt spec
+# parse a remote file available at https://somedomain.com/path-to-carbon.txt
+carbontxt validate file https://somedomain.com/path-to-carbon.txt
 
-```
-carbontxt validate --syntax-only domain
-carbontxt validate --syntax-only -f ./path-to-file.com # look at the file only
-```
+# parse a local file ./path-to-file.com
+carbontxt validate file ./path-to-file.com
 
-Run a validation against a given domain, and only say if the file is valid TOML, and it confirms to the spec, AND if the links are still live
+# pipe the contents of a file into the file validation command as part of a pipeline
+cat ./path-to-file.com | carbontxt validate file -
 
-```
-carbontxt validate --greenweb-csrd --syntax-only --check-links domain
 ```
 
-Run a validation against a given domain, and download the evidence. Take a screenshot of the page if a webpage downloading it and the HTML, (maybe WARCing it), otherwise download the file directly otherwise.
+## With the HTTP API
 
-```
-carbontxt fetch --greenweb-csrd --fetch
-```
+(Coming up next)
diff --git a/docs/readme.md b/docs/readme.md
@@ -0,0 +1,7 @@
+# How carbon.txt is designed.
+
+The carbon.txt validator is split into a series of components, with clear divisions of responsibility
+
+- **Finders**: Finders are responsible for accepting a domain or URI, and resolving it to the final URI where a carbon.txt file is accessible, for fetching and reading.
+- **Parsers**: Parsers are responsible for parsing carbon.txt files, then making sure they are valid and conform to the required data schema.
+- **Processors**: Processors are responsible for parsing specific kinds of linked documents, and data, and returning a valid data structure.
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,21 +12,27 @@ readme = "README.md"
 requires-python = ">=3.11"
 classifiers = ["License :: OSI Approved :: Apache Software License"]
 dependencies = [
+    "django-ninja>=1.3.0",
     "dnspython>=2.7.0",
     "httpx>=0.27.2",
-    "mypy>=1.12.0",
-    "pytest>=8.3.3",
     "rich>=13.9.2",
     # faster, but not yet supported by python 3.13 yet
     # "rtoml>=0.11.0",
     "typer>=0.12.5",
 ]
 
-
 [project.scripts]
 carbon-txt = "carbon_txt_validator.cli:app"
 
 
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
+
+[tool.uv]
+dev-dependencies = [
+    "ipdb>=0.13.13",
+    "mypy>=1.12.0",
+    "pytest-watch>=4.2.0",
+    "pytest>=8.3.3",
+]
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,3 @@
+# pytest.ini
+[pytest]
+pythonpath = src
diff --git a/src/carbon_txt_validator/__init__.py b/src/carbon_txt_validator/__init__.py
@@ -1,2 +0,0 @@
-def hello() -> str:
-    return "Hello from carbon-txt-validator!"

diff --git a/src/carbon_txt_validator/cli.py b/src/carbon_txt_validator/cli.py
@@ -1,54 +1,63 @@
 import typer
-import httpx
-import dns.resolver
-import tomllib as toml
 import rich
 
+from . import finders, parsers_toml
+
 app = typer.Typer()
 validate_app = typer.Typer()
 app.add_typer(validate_app, name="validate")
 
+file_finder = finders.FileFinder()
+parser = parsers_toml.CarbonTxtParser()
+
 
 @validate_app.command("domain")
 def validate_domain(domain: str):
 
+    result = file_finder.resolve_domain(domain)
 
-    dns_record = dns.resolver.resolve(domain, "TXT")
-    url = dns_record[0].strings[0].decode()
+    if result:
+        rich.print(f"Carbon.txt file found at {result}.")
+    else:
+        rich.print(f"No valid carbon.txt file found on {domain}.")
+        return 1
 
-    httpx.get(url)
-        content = response.text
-    data = toml.loads(content)
-    rich.print(data)
-    # Process the data...
+    # fetch and parse file
 
+    content = parser.get_carbon_txt_file(result)
+    # rich.print(content)
+    parsed_result = parser.parse_toml(content)
+    rich.print(parsed_result)
+    vaidation_results = parser.validate_as_carbon_txt(parsed_result)
+    rich.print(vaidation_results)
 
-@validate_app.command("url")
-def validate_url(url: str):
-    breakpoint()
-    with httpx.Client() as client:
-        response = client.get(url)
-        content = response.text
-    data = toml.loads(content)
-    rich.print(data)
-    # Process the data...
+    return 0
 
 
 @validate_app.command("file")
-def validate_file(file_path: str):
-    with open(file_path, "r") as file:
-        content = file.read()
-    data = toml.loads(content)
-    rich.print(data)
-    # Process the data...
-
-
-@validate_app.command("stdin")
-def validate_stdin():
-    content = typer.get_text_stream("stdin").read()
-    data = toml.loads(content)
-    rich.print(data)
-    # Process the data...
+def validate_file(
+    file_path: str = typer.Argument(
+        ..., help="Path to carbon.txt file or '-' to read from STDIN"
+    )
+):
+
+    if file_path == "-":
+        content = typer.get_text_stream("stdin").read()
+    else:
+        result = file_finder.resolve_uri(file_path)
+        if result:
+            rich.print(f"Carbon.txt file found at {result}.")
+            content = parser.get_carbon_txt_file(result)
+        else:
+            rich.print(f"No valid carbon.txt file found at {file_path}.")
+            return 1
+
+    parsed_result = parser.parse_toml(content)
+    vaidation_results = parser.validate_as_carbon_txt(parsed_result)
+    rich.print("-------")
+    rich.print(parsed_result)
+
+    return parsed_result
 
 
 if __name__ == "__main__":

diff --git a/src/carbon_txt_validator/finders.py b/src/carbon_txt_validator/finders.py
@@ -0,0 +1,93 @@
+import httpx
+import dns.resolver
+import tomllib as toml
+from pathlib import Path
+from urllib.parse import urlparse, ParseResult
+from typing import Optional
+import logging
+import rich
+
+logger = logging.getLogger(__name__)
+
+logger.setLevel(logging.DEBUG)
+
+
+class FileFinder:
+    """
+    Responsible for figuring out which URI to fetch
+    a carbon.txt file from.
+    """
+
+    def _parse_uri(self, uri: str) -> Optional[ParseResult]:
+        """
+        Return a parsed URI object if the URI is valid, otherwise return None
+        """
+        parsed_uri = urlparse(uri)
+        if parsed_uri.scheme in ("http", "https"):
+            return parsed_uri
+        return None
+
+    def _lookup_dns(self, domain):
+        """Try a DNS TXT record lookup for the given domain"""
+
+        # look for a TXT record on the domain first
+        # if have it, return that
+        try:
+            dns_record = dns.resolver.resolve(domain, "TXT")
+            if dns_record:
+                url = dns_record[0].strings[0].decode()
+                rich.print(url)
+                return url
+
+        except dns.resolver.NoAnswer:
+            logger.info("No result from TXT lookup")
+            return None
+
+    def resolve_domain(self, domain) -> str:
+        """
+        Accepts a domain, and returns a URI to fetch a carbon.txt file from.
+        """
+
+        uri_from_domain = self._lookup_dns(domain)
+
+        if uri_from_domain:
+            return self._parse_uri(uri_from_domain).geturl()
+
+        # otherwise we look for a carbon.txt file at the root of the domain, then fallback to
+        # one at the .well-known path following the well-known convention
+        default_paths = ["/carbon.txt", "/.well-known/carbon.txt"]
+
+        for url_path in default_paths:
+            uri = urlparse(f"https://{domain}{url_path}")
+            response = httpx.head(uri.geturl())
+            if response.status_code == 200:
+                return uri.geturl()
+
+        # if a path has the 'via' header, it suggests a managed service or proxy
+        # follow that path to fetch the active carbon.txt file
+
+        # TODO allow file on the domain to override the one specified in a 'via'
+
+    def resolve_uri(self, uri: str) -> str:
+        """
+        Accept a URI pointing to a carbon.txt file, and return the final
+        resolved URI, following any 'via' referrers or similar.
+        """
+
+        # check if the uri looks like one we might reach over HTTP / HTTPS
+        parsed_uri = self._parse_uri(uri)
+
+        # if there is no http or https scheme, we assume a local file
+        if not parsed_uri:
+            path_to_file = Path(uri)
+            return str(path_to_file.resolve())
+
+        response = httpx.head(parsed_uri.geturl())
+        if response.status_code == 200:
+            return parsed_uri.geturl()
+
+        # fallback to doing retry or notifying us if the domain is unreachable
+        # TODO:
+
+        # check if there is a via header
+        # TODO: do we follow multiple 'via' headers? If so, how many hops do we follow before we timeout?
diff --git a/src/carbon_txt_validator/parsers_toml.py b/src/carbon_txt_validator/parsers_toml.py
@@ -0,0 +1,42 @@
+import tomllib as toml
+from . import schemas
+import httpx
+import pathlib
+
+
+class CarbonTxtParser:
+    """
+    Responsible for parsing carbon.txt files, checking
+    they are valid TOML, and that parsed data structures
+    have the expected top level keys and values.
+    """
+
+    def get_carbon_txt_file(self, str) -> str:
+        """
+        Accept a URI and either fetch the file over HTTP(S), or read the local file.
+        Return a string of contents of the remote carbon.txt file, or the local file.
+        """
+        if str.startswith("http"):
+            result = httpx.get(str).text
+            return result
+
+        if pathlib.Path(str).exists():
+            return pathlib.Path(str).read_text()
+
+    def parse_toml(self, str) -> dict:
+        """
+        Accept a string of TOML and return the parsed data as a
+        plain dict (not yet validated as a CarbonTxtFile)
+        """
+        parsed = toml.loads(str)
+        return parsed
+
+    def validate_as_carbon_txt(self, parsed) -> schemas.CarbonTxtFile:
+        """
+        Accept a parsed TOML object and return a CarbonTxtFile, validating that
+        necessary keys are present and values are of the correct type.
+        """
+
+        carb_txt_obj = schemas.CarbonTxtFile(**parsed)
+
+        return carb_txt_obj
diff --git a/src/carbon_txt_validator/schemas.py b/src/carbon_txt_validator/schemas.py
@@ -0,0 +1,61 @@
+from typing import Literal, Optional, List
+
+from pydantic import BaseModel, Field
+
+
+class Provider(BaseModel):
+    """
+    Providers in this context are upstream providers of hosted services.
+    The domain is used as key for looking up a corresponding provider in the
+    Green Web Platform
+    """
+
+    domain: str
+    name: Optional[str] = None
+    services: Optional[List[str]] = None
+
+
+class Credential(BaseModel):
+    """
+    Credentials are essentially supporting documentation for a claim to be running on
+    green energy.
+    """
+
+    domain: Optional[str] = None
+    doctype: Literal[
+        "web-page", "annual-report", "sustainability-page", "certificate", "other"
+    ]
+    url: str
+
+
+class Organisation(BaseModel):
+    """
+    An Organisation is the entity making the claim to running its infrastructure
+    on green energy. At the very least it should have some credentials to point to, even
+    if it is exclusively relying on upstream providers for its green claims.
+    """
+
+    credentials: List[Credential] = Field(..., min_length=1)
+
+
+class Upstream(BaseModel):
+    """
+    Upstream refers to one or more providers of hosted services that the Organisation
+    is relying on to operate a digital service, like running a website, or application.
+    """
+
+    # organisations that don't use third party providers could plausibly have an
+    # empty upstream list. We also either accept providers as a single string representing
+    # a domain, or a dictionary containing the fields defined in the Provider model
+    providers: Optional[List[Provider | str]] = None
+
+
+class CarbonTxtFile(BaseModel):
+    """
+    A carbon.txt file is the data structure that acts as an index for supporting evidence
+    for green claims made by a specific organisation. It is intended to link to
+    machine readable data or supporting documentation in the public domain.
+    """
+
+    upstream: Optional[Upstream] = None
+    org: Organisation
Original file line number	Diff line number	Diff line change
		@@ -1,2 +0,0 @@
		def hello() -> str:
		return "Hello from carbon-txt-validator!"