diff --git a/CHANGELOG.md b/CHANGELOG.md index 5682b27..f7a6515 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to ApplyPilot will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added + **Greenhouse ATS support** - New discovery source for 129 AI/ML startups and tech companies using Greenhouse (Scale AI, Stripe, Figma, Notion, etc.). Uses official Greenhouse Job Board API (`boards-api.greenhouse.io`) for reliable, structured data. + **New module**: `src/applypilot/discovery/greenhouse.py` - API-based fetcher with full job descriptions, parallel execution, location filtering, and query matching + **New config**: `src/applypilot/config/greenhouse.yaml` - 129 verified Greenhouse employers organized by category (Core AI, Infrastructure, Fintech, Healthcare, etc.) + **User config override** - Users can extend/modify employers via `~/.applypilot/greenhouse.yaml` + **New CLI commands** - `applypilot greenhouse verify|discover|validate|list-employers|add-job` for managing Greenhouse employers + **Pipeline integration** - Greenhouse fetcher runs automatically during `discover` stage alongside JobSpy, Workday, and SmartExtract + ## [0.2.0] - 2026-02-17 ### Added diff --git a/README.md b/README.md index e7fe08e..0de4be4 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ Runs stages 1-5: discovers jobs, scores them, tailors your resume, generates cov | Stage | What Happens | |-------|-------------| -| **1. Discover** | Scrapes 5 job boards (Indeed, LinkedIn, Glassdoor, ZipRecruiter, Google Jobs) + 48 Workday employer portals + 30 direct career sites | +| **1. Discover** | Scrapes 5 job boards (Indeed, LinkedIn, Glassdoor, ZipRecruiter, Google Jobs) + 48 Workday employer portals + 129 Greenhouse ATS employers + 30 direct career sites | | **2. Enrich** | Fetches full job descriptions via JSON-LD, CSS selectors, or AI-powered extraction | | **3. Score** | AI rates every job 1-10 based on your resume and preferences. Only high-fit jobs proceed | | **4. Tailor** | AI rewrites your resume per job: reorganizes, emphasizes relevant experience, adds keywords. Never fabricates | @@ -77,7 +77,7 @@ Each stage is independent. Run them all or pick what you need. | AI scoring | 1-10 fit score per job | Basic filtering | Your gut feeling | | Resume tailoring | Per-job AI rewrite | Template-based | Hours per application | | Auto-apply | Full form navigation + submission | LinkedIn Easy Apply only | Click, type, repeat | -| Supported sites | Indeed, LinkedIn, Glassdoor, ZipRecruiter, Google Jobs, 46 Workday portals, 28 direct sites | LinkedIn | Whatever you open | +| Supported sites | Indeed, LinkedIn, Glassdoor, ZipRecruiter, Google Jobs, 129 Greenhouse employers, 46 Workday portals, 28 direct sites | LinkedIn | Whatever you open | | License | AGPL-3.0 | MIT | N/A | --- @@ -127,7 +127,7 @@ API keys and runtime config: `GEMINI_API_KEY`, `LLM_MODEL`, `CAPSOLVER_API_KEY` ## How Stages Work ### Discover -Queries Indeed, LinkedIn, Glassdoor, ZipRecruiter, Google Jobs via JobSpy. Scrapes 48 Workday employer portals (configurable in `employers.yaml`). Hits 30 direct career sites with custom extractors. Deduplicates by URL. +Queries Indeed, LinkedIn, Glassdoor, ZipRecruiter, Google Jobs via JobSpy. Fetches from 129 Greenhouse ATS employers. Scrapes 48 Workday employer portals (configurable in `employers.yaml`). Hits 30 direct career sites with custom extractors. Deduplicates by URL. ### Enrich Visits each job URL and extracts the full description. 3-tier cascade: JSON-LD structured data, then CSS selector patterns, then AI-powered extraction for unknown layouts. diff --git a/src/applypilot/cli.py b/src/applypilot/cli.py index 6c8be91..7d61e47 100644 --- a/src/applypilot/cli.py +++ b/src/applypilot/cli.py @@ -453,5 +453,10 @@ def doctor() -> None: console.print() + +# Import and add greenhouse subcommand +from applypilot.cli_greenhouse import app as greenhouse_app +app.add_typer(greenhouse_app, name="greenhouse", help="Manage Greenhouse ATS employers") + if __name__ == "__main__": app() diff --git a/src/applypilot/cli_greenhouse/__init__.py b/src/applypilot/cli_greenhouse/__init__.py new file mode 100644 index 0000000..265c339 --- /dev/null +++ b/src/applypilot/cli_greenhouse/__init__.py @@ -0,0 +1,423 @@ +"""Greenhouse CLI commands for managing Greenhouse ATS employers.""" + +from __future__ import annotations + +import re +import time +from pathlib import Path +from typing import List, Optional, Tuple +from urllib.parse import urlparse + +import httpx +import typer +import yaml +from rich.console import Console +from rich.table import Table + +console = Console() + +# API endpoint templates +API_BASE = "https://boards-api.greenhouse.io/v1/boards" +API_TEMPLATE = f"{API_BASE}/{{slug}}/jobs" + +# Known slug fixes +KNOWN_FIXES = {"notion": "notionhq"} + +app = typer.Typer( + name="greenhouse", + help="Manage Greenhouse ATS employers and verify configurations.", + no_args_is_help=True, +) + + +def _load_config(config_path: Optional[Path] = None) -> dict: + """Load greenhouse.yaml configuration.""" + if config_path is None: + # Try user config first, then package config + from applypilot.config import APP_DIR, CONFIG_DIR + + user_path = APP_DIR / "greenhouse.yaml" + if user_path.exists(): + config_path = user_path + else: + config_path = CONFIG_DIR / "greenhouse.yaml" + + if not config_path.exists(): + console.print(f"[red]Config not found:[/red] {config_path}") + raise typer.Exit(code=1) + + with config_path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + + return data.get("employers", {}) + + +def _check_slug(slug: str) -> Tuple[bool, Optional[int], Optional[str]]: + """Check if a slug is valid via Greenhouse API.""" + url = API_TEMPLATE.format(slug=slug) + + try: + with httpx.Client(timeout=10.0) as client: + resp = client.get(url, headers={"Accept": "application/json"}) + except httpx.RequestError as e: + return False, None, f"Request error: {e}" + + if resp.status_code == 200: + try: + data = resp.json() + jobs = data.get("jobs", []) + total = len(jobs) + return True, total, None + except ValueError: + return True, None, "Invalid JSON" + elif resp.status_code == 404: + return False, None, "Not found" + elif resp.status_code == 429: + return False, None, "Rate limited" + else: + return False, None, f"HTTP {resp.status_code}" + + +def _generate_variations(name: str) -> List[str]: + """Generate slug variations for a company name.""" + name = name.lower().strip() + variations = [name] + + # No spaces + no_spaces = name.replace(" ", "") + if no_spaces != name: + variations.append(no_spaces) + + # Dashes + dash = name.replace(" ", "-") + if dash != name: + variations.append(dash) + + # Underscores + underscore = name.replace(" ", "_") + if underscore != name: + variations.append(underscore) + + # First word only + first_word = name.split()[0] if name else "" + if first_word and first_word not in variations: + variations.append(first_word) + + # Suffixes + for suffix in ["careers", "jobs"]: + plain = f"{name}{suffix}" + dash_suf = f"{name}-{suffix}" + if plain not in variations: + variations.append(plain) + if dash_suf not in variations: + variations.append(dash_suf) + + # Deduplicate while preserving order + seen = set() + return [v for v in variations if not (v in seen or seen.add(v))] + + +@app.command() +def verify( + slug: str = typer.Argument(..., help="Company slug to verify"), + try_variations: bool = typer.Option( + True, "--variations/--no-variations", help="Try common slug variations if not found" + ), +) -> None: + """Verify a Greenhouse company slug exists.""" + console.print(f"Verifying [bold]{slug}[/bold]...") + + is_valid, total, error = _check_slug(slug) + + if is_valid: + console.print(f"[green]✓[/green] {slug}: {total or 'jobs found'}") + raise typer.Exit(code=0) + + console.print(f"[red]✗[/red] {slug}: {error}") + + if not try_variations: + raise typer.Exit(code=1) + + # Try variations + console.print("\nTrying variations...") + variations = _generate_variations(slug) + + for i, variant in enumerate(variations[1:], 1): # Skip original + time.sleep(1) # Polite delay + is_valid, total, error = _check_slug(variant) + + if is_valid: + console.print(f"[green]✓[/green] {variant}: {total or 'jobs found'}") + raise typer.Exit(code=0) + else: + console.print(f"[red]✗[/red] {variant}: {error}") + + console.print("\n[yellow]No valid slug found[/yellow]") + raise typer.Exit(code=1) + + +@app.command() +def discover( + name: Optional[str] = typer.Argument(None, help="Company name to search for"), + url: Optional[str] = typer.Option(None, "--url", help="Career page URL to scrape"), +) -> None: + """Discover Greenhouse slugs from company name or career URL.""" + if not name and not url: + console.print("[red]Error:[/red] Provide either a company name or --url") + raise typer.Exit(code=1) + + if url: + # Scrape URL for Greenhouse references + console.print(f"Analyzing URL: {url}") + + try: + with httpx.Client(timeout=15.0, follow_redirects=True) as client: + resp = client.get(url, headers={"User-Agent": "Mozilla/5.0"}) + html = resp.text + except Exception as e: + console.print(f"[red]Error fetching URL:[/red] {e}") + raise typer.Exit(code=1) + + # Extract slugs from HTML + patterns = [ + r"boards\.greenhouse\.io/(\w+)", + r"job-boards\.greenhouse\.io/(\w+)", + r"api\.greenhouse\.io/v1/boards/(\w+)", + r"greenhouse\.io/embed/job_board\?for=(\w+)", + ] + + slugs = set() + for pattern in patterns: + matches = re.findall(pattern, html) + slugs.update(matches) + + if not slugs: + # Try hostname as fallback + hostname = urlparse(str(resp.url)).hostname or "" + if hostname: + base = hostname.replace("careers.", "").replace("jobs.", "").replace("www.", "").split(".")[0] + slugs.add(base) + + candidates = list(slugs) + else: + # Generate from name + console.print(f"Trying variations of [bold]{name}[/bold]...") + candidates = _generate_variations(name) + + # Verify candidates + console.print(f"\nChecking {len(candidates)} candidates...\n") + + for i, slug in enumerate(candidates, 1): + console.print(f" ({i}/{len(candidates)}) {slug}...", end=" ") + is_valid, total, error = _check_slug(slug) + + if is_valid: + console.print(f"[green]✓ {total} jobs[/green]") + else: + console.print(f"[red]✗ {error}[/red]") + + if i < len(candidates): + time.sleep(1) # Polite delay + + raise typer.Exit(code=0) + + +@app.command() +def validate( + fix: bool = typer.Option(False, "--fix", help="Auto-fix known slug issues"), + config_path: Optional[Path] = typer.Option(None, "--config", help="Path to greenhouse.yaml"), +) -> None: + """Validate all companies in greenhouse.yaml configuration.""" + employers = _load_config(config_path) + slugs = list(employers.keys()) + total = len(slugs) + + if total == 0: + console.print("[yellow]No employers found in configuration[/yellow]") + raise typer.Exit(code=1) + + console.print(f"Validating [bold]{total}[/bold] companies...\n") + + valid_count = 0 + invalid = {} + fixed = [] + + for i, slug in enumerate(slugs, 1): + console.print(f"Checking {i}/{total}...", end="\r") + + is_valid, total_jobs, error = _check_slug(slug) + + if is_valid: + valid_count += 1 + console.print(f"[green]✓[/green] {slug}: {total_jobs} jobs") + else: + # Try auto-fix + if fix and slug in KNOWN_FIXES: + new_slug = KNOWN_FIXES[slug] + is_valid2, total2, _ = _check_slug(new_slug) + + if is_valid2: + console.print(f"[yellow]✗[/yellow] {slug}: {error}") + console.print(f" [green]→ Fixed:[/green] {slug} → {new_slug} ({total2} jobs)") + fixed.append((slug, new_slug)) + valid_count += 1 + continue + + invalid[slug] = error + console.print(f"[red]✗[/red] {slug}: {error}") + + time.sleep(0.5) # Polite delay + + # Summary + console.print(f"\n[bold]Summary:[/bold] {valid_count}/{total} valid") + + if invalid: + console.print(f"\n[red]Invalid:[/red] {', '.join(invalid.keys())}") + + if fixed: + console.print(f"\n[green]Fixed:[/green] {len(fixed)} issue(s)") + + if valid_count == total: + console.print("\n[green]All companies valid![/green]") + raise typer.Exit(code=0) + else: + raise typer.Exit(code=1) + + +@app.command() +def list_employers( + config_path: Optional[Path] = typer.Option(None, "--config", help="Path to greenhouse.yaml"), +) -> None: + """List all configured Greenhouse employers.""" + employers = _load_config(config_path) + + if not employers: + console.print("[yellow]No employers configured[/yellow]") + raise typer.Exit(code=1) + + table = Table(title="Greenhouse Employers", show_header=True) + table.add_column("Slug", style="cyan") + table.add_column("Name", style="green") + + for slug, data in sorted(employers.items()): + name = data.get("name", slug) + table.add_row(slug, name) + + console.print(table) + console.print(f"\nTotal: {len(employers)} employers") + + +@app.command() +def add_job( + url: str = typer.Argument(..., help="Greenhouse job URL to add"), + dry_run: bool = typer.Option(False, "--dry-run", help="Preview without saving to database"), +) -> None: + """Add a specific Greenhouse job from URL and display structured data.""" + import json + from rich.panel import Panel + from rich.json import JSON + + console.print(f"🔗 Processing URL: {url}") + console.print() + + # Extract company slug and job ID from URL + match = re.search(r'greenhouse\.io/(\w+)/jobs/(\d+)', url) + if not match: + console.print("[red]✗[/red] Invalid Greenhouse URL format") + console.print("[dim]Expected: https://boards.greenhouse.io/{company}/jobs/{job_id}[/dim]") + raise typer.Exit(code=1) + + company_slug = match.group(1) + job_id = match.group(2) + + console.print(f"📍 Company: {company_slug}") + console.print(f"🆔 Job ID: {job_id}") + console.print() + + # Fetch all jobs for this company + console.print("⬇️ Fetching job data...") + from applypilot.discovery.greenhouse import fetch_jobs_api, parse_api_response, _store_jobs + + data = fetch_jobs_api(company_slug, content=True) + + if not data: + console.print("[red]✗[/red] Failed to fetch jobs from API") + raise typer.Exit(code=1) + + # Find the specific job + jobs = parse_api_response(data, company_slug.replace('-', ' ').title(), '') + job = next((j for j in jobs if str(j.get('job_id')) == job_id), None) + + if not job: + console.print(f"[red]✗[/red] Job {job_id} not found") + raise typer.Exit(code=1) + + # Display structured data + console.print("=" * 70) + console.print("[bold green]✓ Job Found[/bold green]") + console.print("=" * 70) + console.print() + + # Basic info table + info_table = Table(show_header=False, box=None) + info_table.add_column("Field", style="cyan", width=15) + info_table.add_column("Value", style="white") + + info_table.add_row("Job ID", str(job.get('job_id', 'N/A'))) + info_table.add_row("Title", job.get('title', 'N/A')) + info_table.add_row("Company", job.get('company', 'N/A')) + info_table.add_row("Location", job.get('location', 'N/A')) + info_table.add_row("Department", job.get('department', 'N/A')) + info_table.add_row("Strategy", job.get('strategy', 'N/A')) + info_table.add_row("URL", job.get('url', 'N/A')[:60] + "...") + info_table.add_row("Updated", job.get('updated_at', 'N/A')) + + console.print(Panel(info_table, title="📋 Job Information", border_style="green")) + console.print() + + # Description panel + desc = job.get('description', '') + if desc: + if len(desc) > 800: + desc = desc[:800] + "..." + console.print(Panel(desc, title="📝 Description", border_style="blue")) + console.print() + + # Full structured data (JSON) + console.print("📊 Full Structured Data (as stored in database):") + + # Create a copy with limited fields for cleaner display + display_job = { + "job_id": job.get("job_id"), + "title": job.get("title"), + "company": job.get("company"), + "location": job.get("location"), + "department": job.get("department"), + "description": job.get("description", "")[:200] + "..." if len(job.get("description", "")) > 200 else job.get("description"), + "url": job.get("url"), + "strategy": job.get("strategy"), + "updated_at": job.get("updated_at"), + } + console.print(JSON(json.dumps(display_job, indent=2, default=str))) + console.print() + + if dry_run: + console.print("[yellow]🏃 Dry run mode - job NOT saved to database[/yellow]") + else: + # Store in database + console.print("💾 Saving to database...") + try: + new, existing = _store_jobs([job]) + if new: + console.print(f"[green]✓[/green] Job saved successfully (new)") + elif existing: + console.print(f"[yellow]⚠[/yellow] Job already exists in database") + else: + console.print(f"[green]✓[/green] Job processed") + except Exception as e: + console.print(f"[red]✗[/red] Failed to save: {e}") + raise typer.Exit(code=1) + + console.print() + console.print("=" * 70) + console.print("[dim]Next: Run 'applypilot run enrich score' to process this job[/dim]") diff --git a/src/applypilot/config/greenhouse.yaml b/src/applypilot/config/greenhouse.yaml new file mode 100644 index 0000000..b1782fc --- /dev/null +++ b/src/applypilot/config/greenhouse.yaml @@ -0,0 +1,419 @@ +# Greenhouse employer registry for ApplyPilot +# These companies have been verified to use Greenhouse ATS +# URL format: https://job-boards.greenhouse.io/{company_slug} + +employers: + # ── Core AI/LLM Companies ──────────────────────────────────────────────── + + scaleai: + name: "Scale AI" + + # ── Infrastructure/Data ────────────────────────────────────────────────── + + stripe: + name: "Stripe" + + figma: + name: "Figma" + +notionhq: + name: "Notion" + + asana: + name: "Asana" + + mongodb: + name: "MongoDB" + + hashicorp: + name: "HashiCorp" + + confluent: + name: "Confluent" + + datadog: + name: "Datadog" + + cloudflare: + name: "Cloudflare" + + # ── Fintech ────────────────────────────────────────────────────────────── + + robinhood: + name: "Robinhood" + + coinbase: + name: "Coinbase" + + plaid: + name: "Plaid" + + affirm: + name: "Affirm" + + upstart: + name: "Upstart" + + brex: + name: "Brex" + + mercury: + name: "Mercury" + + # ── Productivity/Enterprise ────────────────────────────────────────────── + + zapier: + name: "Zapier" + + airtable: + name: "Airtable" + + miro: + name: "Miro" + + clickup: + name: "ClickUp" + + loom: + name: "Loom" + + retool: + name: "Retool" + + monday: + name: "monday.com" + + # ── Security/DevTools ──────────────────────────────────────────────────── + + sentry: + name: "Sentry" + + datadog: + name: "Datadog" + + vercel: + name: "Vercel" + + auth0: + name: "Auth0" + + # ── AI/ML Startups (Verified) ──────────────────────────────────────────── + + sierra: + name: "Sierra" + + moveworks: + name: "Moveworks" + + gong: + name: "Gong" + + amplitude: + name: "Amplitude" + + mixpanel: + name: "Mixpanel" + + # ── Healthcare/Biotech ─────────────────────────────────────────────────── + + tempus: + name: "Tempus" + + guardant: + name: "Guardant Health" + + grail: + name: "GRAIL" + + freenome: + name: "Freenome" + + # ── Enterprise Automation ──────────────────────────────────────────────── + + uipath: + name: "UiPath" + + dataiku: + name: "Dataiku" + + # ── Content/Media ──────────────────────────────────────────────────────── + + canva: + name: "Canva" + + docusign: + name: "DocuSign" + + squarespace: + name: "Squarespace" + + # ── Autonomous Systems ─────────────────────────────────────────────────── + + zoox: + name: "Zoox" + + nuro: + name: "Nuro" + + aurora: + name: "Aurora Innovation" + + anduril: + name: "Anduril" + + # ── Additional Verified Companies ──────────────────────────────────────── + + flexport: + name: "Flexport" + + ginkgo: + name: "Ginkgo Bioworks" + + instacart: + name: "Instacart" + + doordash: + name: "DoorDash" + + snowflake: + name: "Snowflake" + + databricks: + name: "Databricks" + + palantir: + name: "Palantir" + + benchling: + name: "Benchling" + + recursiv: + name: "Recursion" + + 23andme: + name: "23andMe" + + flatiron: + name: "Flatiron Health" + + oscar: + name: "Oscar Health" + + lemonade: + name: "Lemonade" + + root: + name: "Root Insurance" + + hippo: + name: "Hippo Insurance" + + nextinsurance: + name: "Next Insurance" + + coalition: + name: "Coalition" + + "23andme": + name: "23andMe" + + determinedai: + name: "Determined AI" + + bananaml: + name: "Banana ML" + + baseten: + name: "Baseten" + + replicate: + name: "Replicate" + + synthesia: + name: "Synthesia" + + runway: + name: "Runway" + + stability: + name: "Stability AI" + + jasper: + name: "Jasper" + + writer: + name: "Writer" + + copyai: + name: "Copy.ai" + + ada: + name: "Ada" + + kustomer: + name: "Kustomer" + + cognigy: + name: "Cognigy" + + forethought: + name: "Forethought" + + koreai: + name: "Kore.ai" + + bigpanda: + name: "BigPanda" + + observe: + name: "Observe" + + cresta: + name: "Cresta" + + darktrace: + name: "Darktrace" + + abnormalsecurity: + name: "Abnormal Security" + + perimeterx: + name: "PerimeterX" + + vadesecure: + name: "Vade Secure" + + clearbit: + name: "Clearbit" + + crunchbase: + name: "Crunchbase" + + zoominfo: + name: "ZoomInfo" + + outreach: + name: "Outreach" + + salesloft: + name: "SalesLoft" + + apolloio: + name: "Apollo.io" + + chorusai: + name: "Chorus.ai" + + workato: + name: "Workato" + + n8n: + name: "n8n" + + make: + name: "Make" + + workfusion: + name: "WorkFusion" + + blueprism: + name: "Blue Prism" + + automationanywhere: + name: "Automation Anywhere" + + instabase: + name: "Instabase" + + hyperscience: + name: "Hyperscience" + + veritone: + name: "Veritone" + + alteryx: + name: "Alteryx" + + domino: + name: "Domino Data Lab" + + neptuneai: + name: "Neptune.ai" + + clearml: + name: "ClearML" + + launchdarkly: + name: "LaunchDarkly" + + weave: + name: "Weave" + + wayve: + name: "Wayve" + + appliedintuition: + name: "Applied Intuition" + + shieldai: + name: "Shield AI" + + samsara: + name: "Samsara" + + astranis: + name: "Astranis" + + planetlabs: + name: "Planet Labs" + + pathai: + name: "PathAI" + + paige: + name: "Paige" + + insitro: + name: "Insitro" + + zebra: + name: "Zebra Medical Vision" + + aidoc: + name: "Aidoc" + + vizai: + name: "Viz.ai" + + infermedica: + name: "Infermedica" + + buoyhealth: + name: "Buoy Health" + + khealth: + name: "K Health" + + adahealth: + name: "Ada Health" + + babylonhealth: + name: "Babylon Health" + + veracyte: + name: "Veracyte" + + foundationmedicine: + name: "Foundation Medicine" + + klarna: + name: "Klarna" + + metromile: + name: "Metromile" + + deserve: + name: "Deserve" + + navi: + name: "Navi" diff --git a/src/applypilot/discovery/greenhouse.py b/src/applypilot/discovery/greenhouse.py new file mode 100644 index 0000000..65582ff --- /dev/null +++ b/src/applypilot/discovery/greenhouse.py @@ -0,0 +1,394 @@ +"""Greenhouse ATS discovery: fetches jobs from Greenhouse Job Board API. + +Greenhouse is used by ~60% of AI/ML startups (OpenAI, Anthropic, Scale AI, etc.). +Uses the official public Job Board API: https://boards-api.greenhouse.io/v1/boards/{token}/jobs +""" + +import logging +import re +import sqlite3 +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime, timezone +from typing import Optional + +import httpx +import yaml + +from applypilot import config +from applypilot.config import APP_DIR, CONFIG_DIR +from applypilot.database import get_connection + +log = logging.getLogger(__name__) + +# Greenhouse Job Board API endpoint +GREENHOUSE_API_BASE = "https://boards-api.greenhouse.io/v1/boards" + + +def load_employers() -> dict: + """Load Greenhouse employer registry. + + Tries user config first (~/.applypilot/greenhouse.yaml), + falls back to package config. + """ + # Try user config + user_path = APP_DIR / "greenhouse.yaml" + if user_path.exists(): + log.info("Loading user Greenhouse config from %s", user_path) + try: + data = yaml.safe_load(user_path.read_text(encoding="utf-8")) + if data and "employers" in data: + return data.get("employers", {}) + except Exception as e: + log.warning("Failed to load user config: %s", e) + + # Fall back to package config + package_path = CONFIG_DIR / "greenhouse.yaml" + if not package_path.exists(): + log.warning("greenhouse.yaml not found at %s", package_path) + return {} + + try: + data = yaml.safe_load(package_path.read_text(encoding="utf-8")) + return data.get("employers", {}) + except Exception as e: + log.error("Failed to load package config: %s", e) + return {} + + +def _load_location_filter(search_cfg: dict | None = None): + """Load location accept/reject lists from search config.""" + if search_cfg is None: + search_cfg = config.load_search_config() + + accept = search_cfg.get("location_accept", []) + reject = search_cfg.get("location_reject_non_remote", []) + return accept, reject + + +def _location_ok(location: str | None, accept: list[str], reject: list[str]) -> bool: + """Check if a job location passes the user's location filter.""" + if not location: + return True + + loc = location.lower() + + if any(r in loc for r in ("remote", "anywhere", "work from home", "wfh", "distributed")): + return True + + for r in reject: + if r.lower() in loc: + return False + + for a in accept: + if a.lower() in loc: + return True + + return False + + +def _title_matches_query(title: str, query: str) -> bool: + """Check if job title matches search query (simple keyword matching).""" + if not query: + return True + + title_lower = title.lower() + query_terms = query.lower().split() + + # Match if any query term appears in title + return any(term in title_lower for term in query_terms) + + +def _strip_html(html_content: str) -> str: + """Strip HTML tags from content to get plain text.""" + if not html_content: + return "" + + # Simple regex to remove HTML tags + text = re.sub(r"<[^>]+>", "", html_content) + # Normalize whitespace + text = re.sub(r"\s+", " ", text).strip() + return text + + +def fetch_jobs_api(board_token: str, content: bool = True) -> dict | None: + """Fetch jobs from Greenhouse Job Board API. + + Args: + board_token: The company slug (e.g., "stripe", "robinhood") + content: If True, include full job description in response + + Returns: + API response dict with "jobs" and "meta" keys, or None on error + """ + url = f"{GREENHOUSE_API_BASE}/{board_token}/jobs" + params = {"content": "true"} if content else {} + + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Accept": "application/json", + } + + try: + with httpx.Client(timeout=30.0, follow_redirects=True) as client: + resp = client.get(url, headers=headers, params=params) + + if resp.status_code == 404: + log.debug("Board not found: %s", board_token) + return None + elif resp.status_code == 429: + log.warning("Rate limited for %s, retrying...", board_token) + time.sleep(2) + resp = client.get(url, headers=headers, params=params) + resp.raise_for_status() + else: + resp.raise_for_status() + + return resp.json() + + except httpx.HTTPStatusError as e: + log.warning("HTTP error for %s: %s", board_token, e) + return None + except Exception as e: + log.warning("Failed to fetch %s: %s", board_token, e) + return None + + +def parse_api_response(data: dict, company_name: str, query: str = "") -> list[dict]: + """Parse job listings from Greenhouse API response. + + Args: + data: API response dict with "jobs" key + company_name: Display name of the company + query: Optional query string to filter jobs + + Returns: + List of job dicts with standardized fields + """ + jobs = [] + job_list = data.get("jobs", []) + + for job_data in job_list: + try: + title = job_data.get("title", "") + if not title: + continue + + # Filter by query + if query and not _title_matches_query(title, query): + continue + + # Extract location + location_obj = job_data.get("location", {}) + location = location_obj.get("name", "") if isinstance(location_obj, dict) else str(location_obj) + + # Extract department + departments = job_data.get("departments", []) + department = departments[0].get("name", "") if departments else "" + + # Extract offices + offices = job_data.get("offices", []) + office_names = [office.get("name", "") for office in offices if office.get("name")] + + # Get full description and strip HTML + html_content = job_data.get("content", "") + description = _strip_html(html_content) + + # Build job dict + job = { + "title": title, + "company": company_name, + "location": location, + "department": department, + "offices": office_names, + "url": job_data.get("absolute_url", ""), + "strategy": "greenhouse", + # New fields from API + "job_id": job_data.get("id"), + "internal_job_id": job_data.get("internal_job_id"), + "description": description, + "updated_at": job_data.get("updated_at"), + } + + jobs.append(job) + + except Exception as e: + log.debug("Error parsing job: %s", e) + continue + + return jobs + + +def search_employer( + employer_key: str, + employer: dict, + search_text: str, + location_filter: bool = True, + accept_locs: list[str] | None = None, + reject_locs: list[str] | None = None, +) -> list[dict]: + """Search a single Greenhouse employer via API.""" + log.info('%s: searching "%s"...', employer["name"], search_text) + + # Fetch from API + api_data = fetch_jobs_api(employer_key, content=True) + if not api_data: + return [] + + jobs = parse_api_response(api_data, employer["name"], search_text) + + # Apply location filter + if location_filter and (accept_locs or reject_locs): + filtered = [] + for job in jobs: + if _location_ok(job.get("location"), accept_locs or [], reject_locs or []): + filtered.append(job) + jobs = filtered + + log.info("%s: found %d jobs", employer["name"], len(jobs)) + return jobs + + +def search_all( + search_text: str, + workers: int = 4, + location_filter: bool = True, + _employers_override: dict | None = None, +) -> tuple[int, int]: + """Search all configured Greenhouse employers via API. + + Returns (new_jobs_count, existing_jobs_count). + """ + employers = _employers_override if _employers_override else load_employers() + if not employers: + log.warning("No Greenhouse employers configured") + return 0, 0 + + accept_locs, reject_locs = _load_location_filter() + + log.info('Greenhouse API search: %d employers, "%s", workers=%d', len(employers), search_text, workers) + + all_jobs = [] + errors = 0 + + with ThreadPoolExecutor(max_workers=workers) as executor: + futures = { + executor.submit( + search_employer, + key, + emp, + search_text, + location_filter, + accept_locs, + reject_locs, + ): key + for key, emp in employers.items() + } + + for future in as_completed(futures): + key = futures[future] + try: + jobs = future.result() + all_jobs.extend(jobs) + except Exception as e: + log.error("Error searching %s: %s", key, e) + errors += 1 + + log.info( + "Greenhouse API search complete: %d total jobs from %d employers (%d errors)", + len(all_jobs), + len(employers), + errors, + ) + + # Store in database + return _store_jobs(all_jobs) + + +def _store_jobs(jobs: list[dict]) -> tuple[int, int]: + """Store discovered jobs in the database. Returns (new, existing).""" + conn = get_connection() + now = datetime.now(timezone.utc).isoformat() + new = 0 + existing = 0 + + for job in jobs: + try: + conn.execute( + "INSERT INTO jobs (url, title, salary, description, location, site, strategy, " + "discovered_at, full_description, application_url, detail_scraped_at, detail_error) " + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + job["url"], + job["title"], + None, # salary not provided by API + job.get("description", ""), # Now we have full description! + job.get("location", ""), + job["company"], + "greenhouse", + now, + job.get("description"), # full_description + job["url"], # application_url + now, # detail_scraped_at (we got it from API) + None, + ), + ) + new += 1 + except sqlite3.IntegrityError: + existing += 1 + + conn.commit() + return new, existing + + +def run_all_searches( + searches: list[dict], + workers: int = 4, +) -> dict: + """Run multiple search queries across all Greenhouse employers. + + Args: + searches: List of search configs with 'query' key + workers: Number of parallel threads + + Returns: + Dict with total new/existing counts and per-query breakdown + """ + total_new = 0 + total_existing = 0 + per_query = [] + + for search in searches: + query = search.get("query", "") + log.info('Greenhouse API search: "%s"', query) + + new, existing = search_all(query, workers=workers) + total_new += new + total_existing += existing + + per_query.append( + { + "query": query, + "new": new, + "existing": existing, + } + ) + + return { + "total_new": total_new, + "total_existing": total_existing, + "per_query": per_query, + } + + +# Legacy functions for backward compatibility (deprecated) +def fetch_greenhouse_board(company_slug: str) -> str | None: + """DEPRECATED: Use fetch_jobs_api() instead.""" + log.warning("fetch_greenhouse_board() is deprecated, use fetch_jobs_api()") + return None + + +def parse_greenhouse_jobs(html: str, company_name: str, query: str = "") -> list[dict]: + """DEPRECATED: Use parse_api_response() instead.""" + log.warning("parse_greenhouse_jobs() is deprecated, use parse_api_response()") + return [] diff --git a/src/applypilot/pipeline.py b/src/applypilot/pipeline.py index 29881c5..680b37b 100644 --- a/src/applypilot/pipeline.py +++ b/src/applypilot/pipeline.py @@ -61,7 +61,7 @@ def _run_discover(workers: int = 1) -> dict: """Stage: Job discovery — JobSpy, Workday, and smart-extract scrapers.""" - stats: dict = {"jobspy": None, "workday": None, "smartextract": None} + stats: dict = {"jobspy": None, "workday": None, "smartextract": None, "greenhouse": None} # JobSpy console.print(" [cyan]JobSpy full crawl...[/cyan]") @@ -96,6 +96,17 @@ def _run_discover(workers: int = 1) -> dict: console.print(f" [red]Smart extract error:[/red] {e}") stats["smartextract"] = f"error: {e}" + # Greenhouse ATS scraper + console.print(" [cyan]Greenhouse ATS scraper (AI startups)...[/cyan]") + try: + from applypilot.discovery.greenhouse import search_all + new, existing = search_all("", workers=workers) + stats["greenhouse"] = f"ok ({new} new, {existing} existing)" + except Exception as e: + log.error("Greenhouse scraper failed: %s", e) + console.print(f" [red]Greenhouse error:[/red] {e}") + stats["greenhouse"] = f"error: {e}" + return stats diff --git a/tests/discovery/test_greenhouse.py b/tests/discovery/test_greenhouse.py new file mode 100644 index 0000000..ebdb50a --- /dev/null +++ b/tests/discovery/test_greenhouse.py @@ -0,0 +1,541 @@ +"""Unit tests for Greenhouse ATS discovery module.""" + +import os +import sys +import sqlite3 +from datetime import datetime, timezone +from unittest.mock import Mock, patch + +import pytest + +# Ensure src is on sys.path for tests when running from repo root +ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) +SRC = os.path.join(ROOT, "src") +if SRC not in sys.path: + sys.path.insert(0, SRC) + +from applypilot.discovery.greenhouse import ( + GREENHOUSE_API_BASE, + _location_ok, + _store_jobs, + _title_matches_query, + fetch_jobs_api, + load_employers, + parse_api_response, + search_employer, +) + + +class TestLoadEmployers: + """Tests for load_employers function.""" + + def test_loads_employers_from_yaml(self): + """Test that employers are loaded from greenhouse.yaml config.""" + employers = load_employers() + assert isinstance(employers, dict) + assert len(employers) > 0 + # Check for known employers + assert "scaleai" in employers + assert employers["scaleai"]["name"] == "Scale AI" + + def test_returns_empty_dict_if_file_missing(self, tmp_path): + """Test graceful handling of missing config file.""" + with patch("applypilot.discovery.greenhouse.CONFIG_DIR", tmp_path): + employers = load_employers() + assert employers == {} + + +class TestLocationFiltering: + """Tests for location filtering functions.""" + + def test_remote_jobs_always_accepted(self): + """Remote jobs should pass any filter.""" + accept = ["San Francisco"] + reject = ["New York"] + + remote_locations = [ + "Remote", + "Anywhere", + "Work from home", + "WFH", + "Distributed", + "Fully remote", + ] + + for loc in remote_locations: + assert _location_ok(loc, accept, reject) is True + + def test_reject_locations_blocked(self): + """Jobs in reject list should be filtered out.""" + accept = ["CA", "California"] + reject = ["New York", "NYC"] + + assert _location_ok("New York, NY", accept, reject) is False + assert _location_ok("NYC Office", accept, reject) is False + assert _location_ok("San Francisco, CA", accept, reject) is True + + def test_accept_locations_required(self): + """Non-remote jobs must match accept list.""" + accept = ["San Francisco", "California"] + reject = [] + + assert _location_ok("San Francisco, CA", accept, reject) is True + assert _location_ok("Los Angeles, CA", accept, reject) is False + assert _location_ok("", accept, reject) is True # Unknown location passes + + def test_case_insensitive_matching(self): + """Location matching should be case-insensitive.""" + accept = ["san francisco"] + reject = ["new york"] + + assert _location_ok("San Francisco, CA", accept, reject) is True + assert _location_ok("NEW YORK", accept, reject) is False + + +class TestTitleMatching: + """Tests for query title matching.""" + + def test_empty_query_matches_all(self): + """Empty query should match any title.""" + assert _title_matches_query("Software Engineer", "") is True + assert _title_matches_query("", "") is True + + def test_single_keyword_match(self): + """Single keyword should match if in title.""" + assert _title_matches_query("Machine Learning Engineer", "machine learning") is True + assert _title_matches_query("Software Engineer", "machine learning") is False + + def test_multiple_keywords_any_match(self): + """Multiple keywords should match if any present.""" + assert _title_matches_query("Machine Learning Engineer", "machine learning AI") is True + assert _title_matches_query("AI Researcher", "machine learning AI") is True + assert _title_matches_query("Data Scientist", "machine learning AI") is False + + def test_case_insensitive(self): + """Matching should be case-insensitive.""" + assert _title_matches_query("MACHINE LEARNING Engineer", "machine learning") is True + assert _title_matches_query("software engineer", "SOFTWARE") is True + + +class TestParseGreenhouseJobs: + """Tests for API parsing functions (legacy HTML tests remain where relevant).""" + + def test_parse_simple_api_job(self): + """Test parsing a simple job posting from API JSON.""" + api_response = { + "jobs": [ + { + "id": 12345, + "title": "Software Engineer", + "location": {"name": "San Francisco, CA"}, + "absolute_url": "https://boards.greenhouse.io/test/jobs/12345", + "content": "
Great role
", + "departments": [{"name": "Engineering"}], + "updated_at": "2026-02-27T00:00:00Z", + } + ] + } + + jobs = parse_api_response(api_response, "Test Company", "") + + assert len(jobs) == 1 + job = jobs[0] + assert job["title"] == "Software Engineer" + assert job["company"] == "Test Company" + assert job["location"] == "San Francisco, CA" + assert job["department"] == "Engineering" + assert job["strategy"] == "greenhouse" + assert job["url"] == "https://boards.greenhouse.io/test/jobs/12345" + assert job["job_id"] == 12345 + assert job["description"] == "Great role" + assert job["updated_at"] == "2026-02-27T00:00:00Z" + + def test_parse_multiple_api_jobs(self): + """Test parsing multiple jobs from API JSON.""" + api_response = { + "jobs": [ + { + "id": 1, + "title": "Frontend Engineer", + "location": {"name": "Remote"}, + "absolute_url": "https://.../1", + "content": "a
", + "departments": [{"name": "Eng"}], + "updated_at": "2026-02-27T00:00:00Z", + }, + { + "id": 2, + "title": "Backend Engineer", + "location": {"name": "New York, NY"}, + "absolute_url": "https://.../2", + "content": "b
", + "departments": [{"name": "Eng"}], + "updated_at": "2026-02-27T00:00:00Z", + }, + ] + } + + jobs = parse_api_response(api_response, "Test Company", "") + + assert len(jobs) == 2 + assert jobs[0]["title"] == "Frontend Engineer" + assert jobs[1]["title"] == "Backend Engineer" + + def test_filter_by_query_api(self): + """Test filtering API jobs by query string.""" + api_response = { + "jobs": [ + { + "id": 1, + "title": "Machine Learning Engineer", + "location": {"name": "Remote"}, + "absolute_url": "https://.../1", + "content": "a
", + "departments": [], + "updated_at": "2026-02-27T00:00:00Z", + }, + { + "id": 2, + "title": "Sales Representative", + "location": {"name": "Remote"}, + "absolute_url": "https://.../2", + "content": "b
", + "departments": [], + "updated_at": "2026-02-27T00:00:00Z", + }, + ] + } + + jobs = parse_api_response(api_response, "Test Company", "machine learning") + + assert len(jobs) == 1 + assert jobs[0]["title"] == "Machine Learning Engineer" + + def test_offices_and_absolute_url_api(self): + """Test that offices field is parsed and absolute URLs are preserved in API response.""" + api_response = { + "jobs": [ + { + "id": 123, + "title": "Test Job", + "location": {"name": "Remote"}, + "absolute_url": "https://boards.greenhouse.io/test/jobs/123", + "content": "x
", + "departments": [], + "offices": [{"name": "SF Office"}], + "updated_at": "2026-02-27T00:00:00Z", + } + ] + } + + jobs = parse_api_response(api_response, "Test Company", "") + + assert len(jobs) == 1 + assert jobs[0]["url"] == "https://boards.greenhouse.io/test/jobs/123" + assert jobs[0]["offices"] == ["SF Office"] + + def test_handles_empty_html(self): + """Legacy: test handling of empty input via API parser (empty dict).""" + jobs = parse_api_response({}, "Test Company", "") + assert jobs == [] + + def test_handles_no_job_posts(self): + """Test handling of API response without jobs key or empty list.""" + jobs = parse_api_response({"jobs": []}, "Test Company", "") + assert jobs == [] + + +class TestFetchJobsAPI: + """Tests for HTTP fetching functions using the API client.""" + + @patch("applypilot.discovery.greenhouse.httpx.Client") + def test_successful_fetch(self, mock_client_class): + """Test successful API fetch returning JSON.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"jobs": []} + mock_response.raise_for_status = Mock() + + mock_client = Mock() + mock_client.get.return_value = mock_response + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=False) + + mock_client_class.return_value = mock_client + + result = fetch_jobs_api("testcompany") + + assert result == {"jobs": []} + mock_client.get.assert_called_once() + + @patch("applypilot.discovery.greenhouse.httpx.Client") + def test_failed_fetch_returns_none(self, mock_client_class): + """Test that failed fetches return None gracefully.""" + mock_client = Mock() + mock_client.get.side_effect = Exception("Connection error") + mock_client.__enter__ = Mock(return_value=mock_client) + mock_client.__exit__ = Mock(return_value=False) + + mock_client_class.return_value = mock_client + + result = fetch_jobs_api("testcompany") + + assert result is None + + def test_url_format(self): + """Test that API base URL composes correctly.""" + assert f"{GREENHOUSE_API_BASE}/scaleai/jobs" == "https://boards-api.greenhouse.io/v1/boards/scaleai/jobs" + assert f"{GREENHOUSE_API_BASE}/stripe/jobs" == "https://boards-api.greenhouse.io/v1/boards/stripe/jobs" + + +class TestSearchEmployer: + """Tests for employer search function.""" + + @patch("applypilot.discovery.greenhouse.fetch_jobs_api") + @patch("applypilot.discovery.greenhouse.parse_api_response") + def test_search_with_location_filter(self, mock_parse, mock_fetch): + """Test searching with location filter enabled.""" + # fetch_jobs_api returns API dict, parse_api_response returns normalized job list + mock_fetch.return_value = {"jobs": []} + mock_parse.return_value = [ + { + "title": "Engineer", + "company": "Test", + "location": "San Francisco, CA", + "department": "Engineering", + "url": "https://example.com/job", + "strategy": "greenhouse", + } + ] + + employer = {"name": "Test Company"} + jobs = search_employer( + "test", + employer, + "engineer", + location_filter=True, + accept_locs=["San Francisco"], + reject_locs=["New York"], + ) + + assert len(jobs) == 1 + mock_fetch.assert_called_once_with("test", content=True) + mock_parse.assert_called_once_with({"jobs": []}, "Test Company", "engineer") + + @patch("applypilot.discovery.greenhouse.fetch_jobs_api") + def test_search_no_api_returns_empty(self, mock_fetch): + """Test that empty result is returned if API fetch fails.""" + mock_fetch.return_value = None + + employer = {"name": "Test Company"} + jobs = search_employer("test", employer, "engineer") + + assert jobs == [] + + +class TestStoreJobs: + """Tests for database storage functions.""" + + def test_store_new_jobs(self, tmp_path): + """Test storing new jobs in database.""" + db_path = tmp_path / "test.db" + conn = sqlite3.connect(str(db_path)) + + # Create jobs table + conn.execute(""" + CREATE TABLE jobs ( + url TEXT PRIMARY KEY, + title TEXT, + salary TEXT, + description TEXT, + location TEXT, + site TEXT, + strategy TEXT, + discovered_at TEXT, + full_description TEXT, + application_url TEXT, + detail_scraped_at TEXT, + detail_error TEXT + ) + """) + + jobs = [ + { + "title": "Test Job", + "company": "Test Company", + "location": "Remote", + "department": "Engineering", + "url": "https://example.com/job1", + "strategy": "greenhouse", + } + ] + + with patch("applypilot.discovery.greenhouse.get_connection", return_value=conn): + new, existing = _store_jobs(jobs) + + assert new == 1 + assert existing == 0 + + # Verify job was stored + cursor = conn.execute("SELECT title, site FROM jobs WHERE url = ?", ("https://example.com/job1",)) + row = cursor.fetchone() + assert row[0] == "Test Job" + assert row[1] == "Test Company" + + def test_store_duplicate_jobs(self, tmp_path): + """Test that duplicate jobs are counted as existing.""" + db_path = tmp_path / "test.db" + conn = sqlite3.connect(str(db_path)) + + # Create jobs table + conn.execute(""" + CREATE TABLE jobs ( + url TEXT PRIMARY KEY, + title TEXT, + salary TEXT, + description TEXT, + location TEXT, + site TEXT, + strategy TEXT, + discovered_at TEXT, + full_description TEXT, + application_url TEXT, + detail_scraped_at TEXT, + detail_error TEXT + ) + """) + + jobs = [ + { + "title": "Test Job", + "company": "Test Company", + "location": "Remote", + "department": "Engineering", + "url": "https://example.com/job1", + "strategy": "greenhouse", + } + ] + + with patch("applypilot.discovery.greenhouse.get_connection", return_value=conn): + # Store first time + new, existing = _store_jobs(jobs) + assert new == 1 + + # Store again - should be duplicate + new, existing = _store_jobs(jobs) + assert new == 0 + assert existing == 1 + + def test_store_multiple_jobs(self, tmp_path): + """Test storing multiple jobs at once.""" + db_path = tmp_path / "test.db" + conn = sqlite3.connect(str(db_path)) + + conn.execute(""" + CREATE TABLE jobs ( + url TEXT PRIMARY KEY, + title TEXT, + salary TEXT, + description TEXT, + location TEXT, + site TEXT, + strategy TEXT, + discovered_at TEXT, + full_description TEXT, + application_url TEXT, + detail_scraped_at TEXT, + detail_error TEXT + ) + """) + + jobs = [ + { + "title": "Job 1", + "company": "Company A", + "location": "Remote", + "department": "Engineering", + "url": "https://example.com/job1", + "strategy": "greenhouse", + }, + { + "title": "Job 2", + "company": "Company B", + "location": "NYC", + "department": "Sales", + "url": "https://example.com/job2", + "strategy": "greenhouse", + }, + ] + + with patch("applypilot.discovery.greenhouse.get_connection", return_value=conn): + new, existing = _store_jobs(jobs) + + assert new == 2 + assert existing == 0 + + count = conn.execute("SELECT COUNT(*) FROM jobs").fetchone()[0] + assert count == 2 + + +class TestIntegration: + """Integration-style tests.""" + + @patch("applypilot.discovery.greenhouse.fetch_jobs_api") + @patch("applypilot.discovery.greenhouse.get_connection") + def test_end_to_end_search_and_store(self, mock_get_conn, mock_fetch, tmp_path): + """Test full flow from fetch to parse to store using the API client.""" + # Setup mock API response + api_response = { + "jobs": [ + { + "id": 123, + "title": "ML Engineer", + "location": {"name": "Remote"}, + "absolute_url": "https://boards.greenhouse.io/test/jobs/123", + "content": "ML role
", + "departments": [{"name": "Engineering"}], + "updated_at": "2026-02-27T00:00:00Z", + } + ] + } + mock_fetch.return_value = api_response + + # Setup mock DB + db_path = tmp_path / "test.db" + conn = sqlite3.connect(str(db_path)) + conn.execute(""" + CREATE TABLE jobs ( + url TEXT PRIMARY KEY, + title TEXT, + salary TEXT, + description TEXT, + location TEXT, + site TEXT, + strategy TEXT, + discovered_at TEXT, + full_description TEXT, + application_url TEXT, + detail_scraped_at TEXT, + detail_error TEXT + ) + """) + mock_get_conn.return_value = conn + + # Run search (returns jobs but doesn't store them - search_all does that) + employer = {"name": "TestCorp"} + jobs = search_employer("testcorp", employer, "ML") + + # Verify jobs returned + assert len(jobs) == 1 + assert jobs[0]["title"] == "ML Engineer" + + # Manually store jobs to test _store_jobs integration + from applypilot.discovery.greenhouse import _store_jobs + + new, existing = _store_jobs(jobs) + + # Verify stored in DB + assert new == 1 + assert existing == 0 + count = conn.execute("SELECT COUNT(*) FROM jobs").fetchone()[0] + assert count == 1