diff --git a/Modules/Package.swift b/Modules/Package.swift index 05537222fc35..e6a728bf73cd 100644 --- a/Modules/Package.swift +++ b/Modules/Package.swift @@ -20,6 +20,7 @@ let package = Package( .library(name: "WordPressFlux", targets: ["WordPressFlux"]), .library(name: "WordPressShared", targets: ["WordPressShared"]), .library(name: "WordPressUI", targets: ["WordPressUI"]), + .library(name: "WordPressIntelligence", targets: ["WordPressIntelligence"]), .library(name: "WordPressReader", targets: ["WordPressReader"]), .library(name: "WordPressCore", targets: ["WordPressCore"]), .library(name: "WordPressCoreProtocols", targets: ["WordPressCoreProtocols"]), @@ -163,6 +164,10 @@ let package = Package( // This package should never have dependencies – it exists to expose protocols implemented in WordPressCore // to UI code, because `wordpress-rs` doesn't work nicely with previews. ]), + .target(name: "WordPressIntelligence", dependencies: [ + "WordPressShared", + .product(name: "SwiftSoup", package: "SwiftSoup"), + ]), .target(name: "WordPressLegacy", dependencies: ["DesignSystem", "WordPressShared"]), .target(name: "WordPressSharedObjC", resources: [.process("Resources")], swiftSettings: [.swiftLanguageMode(.v5)]), .target( @@ -251,6 +256,7 @@ let package = Package( .testTarget(name: "WordPressSharedObjCTests", dependencies: [.target(name: "WordPressShared"), .target(name: "WordPressTesting")], swiftSettings: [.swiftLanguageMode(.v5)]), .testTarget(name: "WordPressUIUnitTests", dependencies: [.target(name: "WordPressUI")], swiftSettings: [.swiftLanguageMode(.v5)]), .testTarget(name: "WordPressCoreTests", dependencies: [.target(name: "WordPressCore")]), + .testTarget(name: "WordPressIntelligenceTests", dependencies: [.target(name: "WordPressIntelligence")]) ] ) @@ -348,6 +354,7 @@ enum XcodeSupport { "ShareExtensionCore", "Support", "WordPressFlux", + "WordPressIntelligence", "WordPressShared", "WordPressLegacy", "WordPressReader", diff --git 
a/Modules/Sources/WordPressIntelligence/IntelligenceService.swift b/Modules/Sources/WordPressIntelligence/IntelligenceService.swift new file mode 100644 index 000000000000..3a83d5ae9529 --- /dev/null +++ b/Modules/Sources/WordPressIntelligence/IntelligenceService.swift @@ -0,0 +1,52 @@ +import Foundation +import FoundationModels + +public enum IntelligenceService { + /// Maximum context size for language model sessions (in tokens). + /// + /// A single token corresponds to three or four characters in languages like + /// English, Spanish, or German, and one token per character in languages like + /// Japanese, Chinese, or Korean. In a single session, the sum of all tokens + /// in the instructions, all prompts, and all outputs count toward the context window size. + /// + /// https://developer.apple.com/documentation/foundationmodels/generating-content-and-performing-tasks-with-foundation-models#Consider-context-size-limits-per-session + public static let contextSizeLimit = 4096 + + /// Checks if intelligence features are supported on the current device. + public nonisolated static var isSupported: Bool { + guard #available(iOS 26, *) else { + return false + } + switch SystemLanguageModel.default.availability { + case .available: + return true + case .unavailable(let reason): + switch reason { + case .appleIntelligenceNotEnabled, .modelNotReady: + return true + case .deviceNotEligible: + return false + @unknown default: + return false + } + } + } + + /// Extracts relevant text from post content, removing HTML and limiting size. + public static func extractRelevantText(from post: String, ratio: CGFloat = 0.6) -> String { + let extract = try? ContentExtractor.extractRelevantText(from: post) + let postSizeLimit = Double(IntelligenceService.contextSizeLimit) * ratio + return String((extract ?? 
post).prefix(Int(postSizeLimit))) + } + + // As documented in https://developer.apple.com/documentation/foundationmodels/supporting-languages-and-locales-with-foundation-models?changes=_10_5#Use-Instructions-to-set-the-locale-and-language + static func makeLocaleInstructions(for locale: Locale = Locale.current) -> String { + if Locale.Language(identifier: "en_US").isEquivalent(to: locale.language) { + // Skip the locale phrase for U.S. English. + return "" + } else { + // Specify the person's locale with the exact phrase format. + return "The person's locale is \(locale.identifier)." + } + } +} diff --git a/Modules/Sources/WordPressIntelligence/Parameters/ContentLength.swift b/Modules/Sources/WordPressIntelligence/Parameters/ContentLength.swift new file mode 100644 index 000000000000..ef76c7ddc1da --- /dev/null +++ b/Modules/Sources/WordPressIntelligence/Parameters/ContentLength.swift @@ -0,0 +1,58 @@ +import Foundation +import WordPressShared + +/// Target length for generated text. +/// +/// Ranges are calibrated for English and account for cross-language variance. +/// Sentences are the primary indicator; word counts accommodate language differences. +/// +/// - **Short**: 1-2 sentences (15-35 words) - Social media, search snippets +/// - **Medium**: 2-4 sentences (40-80 words) - RSS feeds, blog listings +/// - **Long**: 5-7 sentences (90-130 words) - Detailed previews, newsletters +/// +/// Word ranges are intentionally wide (2-2.3x) to handle differences in language +/// structure (German compounds, Romance wordiness, CJK tokenization). 
+public enum ContentLength: Int, CaseIterable, Sendable { + case short + case medium + case long + + public var displayName: String { + switch self { + case .short: + AppLocalizedString("generation.length.short", value: "Short", comment: "Generated content length (needs to be short)") + case .medium: + AppLocalizedString("generation.length.medium", value: "Medium", comment: "Generated content length (needs to be medium length)") + case .long: + AppLocalizedString("generation.length.long", value: "Long", comment: "Generated content length (needs to be long)") + } + } + + public var trackingName: String { + switch self { + case .short: "short" + case .medium: "medium" + case .long: "long" + } + } + + public var promptModifier: String { + "\(sentenceRange.lowerBound)-\(sentenceRange.upperBound) sentences (\(wordRange.lowerBound)-\(wordRange.upperBound) words)" + } + + public var sentenceRange: ClosedRange<Int> { + switch self { + case .short: 1...2 + case .medium: 2...4 + case .long: 5...7 + } + } + + public var wordRange: ClosedRange<Int> { + switch self { + case .short: 15...35 + case .medium: 40...80 + case .long: 90...130 + } + } +} diff --git a/Modules/Sources/WordPressIntelligence/Parameters/WritingStyle.swift b/Modules/Sources/WordPressIntelligence/Parameters/WritingStyle.swift new file mode 100644 index 000000000000..0d39098730fe --- /dev/null +++ b/Modules/Sources/WordPressIntelligence/Parameters/WritingStyle.swift @@ -0,0 +1,40 @@ +import Foundation +import WordPressShared + +/// Writing style for generated text. 
+public enum WritingStyle: String, CaseIterable, Sendable { + case engaging + case conversational + case witty + case formal + case professional + + public var displayName: String { + switch self { + case .engaging: + AppLocalizedString("generation.style.engaging", value: "Engaging", comment: "AI generation style") + case .conversational: + AppLocalizedString("generation.style.conversational", value: "Conversational", comment: "AI generation style") + case .witty: + AppLocalizedString("generation.style.witty", value: "Witty", comment: "AI generation style") + case .formal: + AppLocalizedString("generation.style.formal", value: "Formal", comment: "AI generation style") + case .professional: + AppLocalizedString("generation.style.professional", value: "Professional", comment: "AI generation style") + } + } + + var promptModifier: String { + "\(rawValue) (\(promptModifierDetails))" + } + + var promptModifierDetails: String { + switch self { + case .engaging: "engaging and compelling tone" + case .witty: "witty, creative, entertaining" + case .conversational: "friendly and conversational tone" + case .formal: "formal and academic tone" + case .professional: "professional and polished tone" + } + } +} diff --git a/Modules/Sources/WordPressIntelligence/README.md b/Modules/Sources/WordPressIntelligence/README.md new file mode 100644 index 000000000000..93f83f088781 --- /dev/null +++ b/Modules/Sources/WordPressIntelligence/README.md @@ -0,0 +1,143 @@ +# WordPressIntelligence + +AI-powered content intelligence for WordPress using Apple Foundation Models. 
+ +## Features + +- **Excerpt Generation** - Generate 3 excerpt variations in 8 languages with configurable length/style +- **Tag Suggestions** - AI-powered tag recommendations +- **Post Summaries** - Automatic content summarization + +## Requirements + +- iOS 26.0+ +- Device with Apple Intelligence support + +## Usage + +```swift +let generator = ExcerptGeneration(length: .medium, style: .engaging) +let excerpts = try await generator.generate(for: postContent) +``` + +**Languages**: English, Spanish, French, German, Italian, Portuguese, Japanese, Chinese +**Lengths**: Short (15-35 words), Medium (40-80 words), Long (90-130 words) +**Styles**: Engaging, Professional, Conversational, Formal, Witty + +## Testing + +### Standard XCTest + +Run standard tests that verify language, length, and diversity: + +```bash +cd Modules +xcodebuild test \ + -scheme Modules-Package \ + -destination 'platform=iOS Simulator,name=iPhone 16 Pro,OS=26.0' \ + -only-testing:WordPressIntelligenceTests +``` + +### Quality Evaluation + +Evaluate AI-generated content quality using Claude scoring. Requires [Claude CLI](https://github.com/anthropics/claude-cli). 
+ +**Location**: `Modules/Tests/WordPressIntelligenceTests/` + +```bash +# Quick start +cd Modules/Tests/WordPressIntelligenceTests +make # Show all available commands +make eval # Run full evaluation (all test types) +make eval-quick # Run English excerpt evaluation +make eval TESTS="excerpts" # Run only excerpt tests +make eval TESTS="excerpts tags" # Run excerpt and tag tests +make eval-tags # Evaluate tag suggestions +make eval-summary # Evaluate post summaries +make open # Open latest HTML report +``` + +**Common targets**: +- `make eval` - Run full evaluation for all test types (excerpts, tags, summary) +- `make eval TESTS="excerpts"` - Run only specific test types +- `make eval-quick` - Fast evaluation (English excerpts only) +- `make rebuild-improve` - Regenerate HTML with mock improvements (for UI development) +- `make open` - Open latest evaluation report +- `make help` - Show all available commands + +For advanced options and HTML report development, see: +- `Modules/Tests/WordPressIntelligenceTests/Makefile` +- `Modules/Tests/WordPressIntelligenceTests/lib/DEVELOPMENT.md` + +### Evaluation Output + +Results are saved to `/tmp/WordPressIntelligence-Tests/evaluation-/`: + +- **`evaluation-report.html`** - Interactive report with filtering, sorting, baseline comparison +- **`evaluation-results.json`** - Machine-readable data for CI/CD +- Console output with quick summary + +**HTML Report Features**: +- Sortable columns (test name, status, score, duration) +- Filter by language, status, or comparison results +- Baseline comparison with delta indicators (↑ improved, ↓ regressed, = unchanged) +- Click any test to see detailed scores, generated content, and Claude feedback +- Score distribution dots (●●●) show pass/warn/fail for each excerpt + +### Scoring + +Quality scores use weighted criteria (1-10 scale): + +**Excerpt Generation**: +- Language Match (3.0×), Grammar (2.0×), Relevance (2.0×) - critical factors +- Hook Quality (1.5×), Key Info (1.5×), Length, 
Style, Standalone, Engagement (1.0× each) +- Diversity: structural, angle, length, lexical variation + +**Pass criteria**: Overall ≥ 7.0 AND no critical failures +**Needs Improvement**: 6.0-6.9 OR any score < 4.0 +**Failed**: Language < 8.0 OR Grammar < 6.0 OR Overall < 6.0 + +*Note: Tag and summary evaluations use different criteria optimized for their use cases.* + +## Extending Tests + +### Adding Test Cases + +1. Add test data to `lib/config.py`: +```python +"new_test_case": TestConfig( + original_content="...", + language="english", + # ... other parameters +) +``` + +2. Update `Makefile` if adding new test type: +```makefile +eval-newtype: + @./lib/evaluate-with-claude.sh --test-type newtype +``` + +### Customizing Evaluation Criteria + +Edit scoring logic in `lib/evaluators.py`. Each test type has its own evaluator class with weighted criteria and thresholds. + +### Developing HTML Report + +For fast iteration on HTML report UI without re-running tests: + +```bash +make rebuild-improve # Regenerate with mock improvements +# Edit lib/evaluation-viewer.html +make rebuild-improve # Instant preview +``` + +See `lib/DEVELOPMENT.md` for complete HTML development workflow. + +## Troubleshooting + +**Tests skipped**: Missing iOS 26 or Apple Intelligence support +**Language issues**: Check prompt in `Sources/WordPressIntelligence/ExcerptGeneration.swift` +**Evaluation fails**: Install/configure Claude CLI: `pip install claude-cli && claude configure` + +See `CLAUDE.md` for project development guidelines. diff --git a/Modules/Sources/WordPressIntelligence/UseCases/PostExcerptGenerator.swift b/Modules/Sources/WordPressIntelligence/UseCases/PostExcerptGenerator.swift new file mode 100644 index 000000000000..a53833c6b6db --- /dev/null +++ b/Modules/Sources/WordPressIntelligence/UseCases/PostExcerptGenerator.swift @@ -0,0 +1,105 @@ +import Foundation +import FoundationModels + +/// Excerpt generation for WordPress posts. 
+/// +/// Generates multiple excerpt variations for blog posts with customizable +/// length and writing style. Supports session-based usage (for UI with continuity) +/// and one-shot generation (for tests and background tasks). +@available(iOS 26, *) +public struct PostExcerptGenerator { + public var length: ContentLength + public var style: WritingStyle + public var options: GenerationOptions + + public init( + length: ContentLength, + style: WritingStyle, + options: GenerationOptions = GenerationOptions(temperature: 0.7) + ) { + self.length = length + self.style = style + self.options = options + } + + /// Generates excerpts with this configuration. + public func generate(for content: String) async throws -> [String] { + let content = IntelligenceService.extractRelevantText(from: content) + let response = try await makeSession().respond( + to: makePrompt(content: content), + generating: Result.self, + options: options + ) + return response.content.excerpts + } + + /// Creates a language model session configured for excerpt generation. + public func makeSession() -> LanguageModelSession { + LanguageModelSession( + model: .init(guardrails: .permissiveContentTransformations), + instructions: Self.instructions + ) + } + + /// Instructions for the language model session. + public static var instructions: String { + """ + You are helping a WordPress user generate an excerpt for their post or page. + + **Prompt Parameters** + - POST_CONTENT: contents of the post (HTML or plain text) + - TARGET_LENGTH: MANDATORY sentence count (primary) and word count (secondary) for each excerpt + - GENERATION_STYLE: the writing style to follow + + \(IntelligenceService.makeLocaleInstructions()) + + **CRITICAL Requirements (MUST be followed exactly)** + 1. ⚠️ LANGUAGE: Generate excerpts in the SAME language as POST_CONTENT. NO translation. NO defaulting to English. Match input language EXACTLY. + + 2. ⚠️ LENGTH: Each excerpt MUST match the TARGET_LENGTH specification. 
+ - PRIMARY: Match the sentence count (e.g., "1-2 sentences" means write 1 or 2 complete sentences) + - SECONDARY: Stay within the word count range (accommodates language differences) + - Write complete sentences only. Count sentences after writing. + - VERIFY both sentence and word counts before responding. + + 3. ⚠️ STYLE: Follow the GENERATION_STYLE exactly (witty, professional, engaging, etc.) + + **Excerpt best practices** + - Follow WordPress ecosystem best practices for post excerpts + - Include the post's main value proposition + - Use active voice (avoid "is", "are", "was", "were" when possible) + - End with implicit promise of more information (no ellipsis) + - Include strategic keywords naturally + - Write independently from the introduction – don't duplicate the opening paragraph + - Make excerpts work as standalone copy for search results, social media, and email + """ + } + + /// Creates a prompt for this excerpt configuration. + public func makePrompt(content: String) -> String { + """ + Generate EXACTLY 3 different excerpts for the given post. + + TARGET_LENGTH: \(length.promptModifier) + CRITICAL: Write \(length.sentenceRange.lowerBound)-\(length.sentenceRange.upperBound) complete sentences. Stay within \(length.wordRange.lowerBound)-\(length.wordRange.upperBound) words. + + GENERATION_STYLE: \(style.promptModifier) + + POST_CONTENT: + \(content) + """ + } + + /// Prompt for generating additional excerpt options. 
+ public static var loadMorePrompt: String { + "Generate 3 additional excerpts following the same TARGET_LENGTH and GENERATION_STYLE requirements" + } + + // MARK: - Result Type + + @Generable + public struct Result { + @Guide(description: "Suggested post excerpts", .count(3)) + public var excerpts: [String] + } +} diff --git a/Modules/Sources/WordPressIntelligence/UseCases/PostSummaryGenerator.swift b/Modules/Sources/WordPressIntelligence/UseCases/PostSummaryGenerator.swift new file mode 100644 index 000000000000..d44ca1627990 --- /dev/null +++ b/Modules/Sources/WordPressIntelligence/UseCases/PostSummaryGenerator.swift @@ -0,0 +1,81 @@ +import Foundation +import FoundationModels + +/// Post summarization for WordPress content. +/// +/// Generates concise summaries that capture the main points and key information +/// from WordPress post content in the same language as the source. +/// +/// Example usage: +/// ```swift +/// let summary = PostSummary() +/// let result = try await summary.generate(content: postContent) +/// ``` +@available(iOS 26, *) +public struct PostSummaryGenerator { + public var options: GenerationOptions + + public init(options: GenerationOptions = GenerationOptions(temperature: 0.3)) { + self.options = options + } + + /// Generate a summary for the given post content. + /// + /// - Parameter content: The post content to summarize (HTML or plain text) + /// - Returns: A concise summary in the same language as the source + /// - Throws: If the language model session fails + public func generate(content: String) async throws -> String { + let session = makeSession() + let prompt = makePrompt(content: content) + return try await session.respond(to: prompt).content + } + + /// Creates a language model session configured for post summarization. 
+ /// + /// - Returns: Configured session with instructions + public func makeSession() -> LanguageModelSession { + LanguageModelSession( + model: .init(guardrails: .permissiveContentTransformations), + instructions: Self.instructions + ) + } + + /// Instructions for the language model on how to generate summaries. + public static var instructions: String { + """ + You are helping a WordPress user understand the content of a post. + Generate a concise summary that captures the main points and key information. + The summary should be clear, informative, and written in a neutral tone. + + \(IntelligenceService.makeLocaleInstructions()) + + Do not include anything other than the summary in the response. + """ + } + + /// Builds the prompt for summarizing post content. + /// + /// - Parameter content: The post content to summarize + /// - Returns: Formatted prompt string + public func makePrompt(content: String) -> String { + let extractedContent = IntelligenceService.extractRelevantText(from: content, ratio: 0.8) + + return """ + Summarize the following post: + + \(extractedContent) + """ + } +} + +@available(iOS 26, *) +extension IntelligenceService { + /// Post summarization for WordPress content. + /// + /// - Parameter content: The post content to summarize + /// - Returns: A concise summary + /// - Throws: If summarization fails + public func summarize(content: String) async throws -> String { + try await PostSummaryGenerator().generate(content: content) + } +} diff --git a/Modules/Sources/WordPressIntelligence/UseCases/SupportTicketSummaryGenerator.swift b/Modules/Sources/WordPressIntelligence/UseCases/SupportTicketSummaryGenerator.swift new file mode 100644 index 000000000000..1c3865cda358 --- /dev/null +++ b/Modules/Sources/WordPressIntelligence/UseCases/SupportTicketSummaryGenerator.swift @@ -0,0 +1,44 @@ +import Foundation +import FoundationModels + +/// Support ticket summarization. 
+/// +/// Generates short, concise titles (fewer than 10 words) for support +/// conversations based on the opening message. +@available(iOS 26, *) +public enum SupportTicketSummaryGenerator { + public static func execute(content: String) async throws -> String { + let instructions = """ + You are helping a user by summarizing their support request down to a single sentence + with fewer than 10 words. + + The summary should be clear, informative, and written in a neutral tone. + You MUST generate the summary in the same language as the support request. + + Do not include anything other than the summary in the response. + """ + + let session = LanguageModelSession( + model: .init(guardrails: .permissiveContentTransformations), + instructions: instructions + ) + + let prompt = """ + Give me an appropriate conversation title for the following opening message of the conversation: + + \(content) + """ + + return try await session.respond( + to: prompt, + generating: Result.self, + options: GenerationOptions(temperature: 1.0) + ).content.title + } + + @Generable + struct Result { + @Guide(description: "The conversation title") + var title: String + } +} diff --git a/Modules/Sources/WordPressIntelligence/UseCases/TagSuggestionGenerator.swift b/Modules/Sources/WordPressIntelligence/UseCases/TagSuggestionGenerator.swift new file mode 100644 index 000000000000..1881265436d8 --- /dev/null +++ b/Modules/Sources/WordPressIntelligence/UseCases/TagSuggestionGenerator.swift @@ -0,0 +1,107 @@ +import Foundation +import FoundationModels +import WordPressShared + +/// Tag suggestion for WordPress posts. +/// +/// Generates relevant tags based on post content and existing site tags, +/// matching the language and formatting pattern of existing tags. 
+@available(iOS 26, *) +public struct TagSuggestionGenerator { + public var options: GenerationOptions + + public init(options: GenerationOptions = GenerationOptions(temperature: 0.2)) { + self.options = options + } + + /// Generates tags for a WordPress post. + public func generate(post: String, siteTags: [String] = [], postTags: [String] = []) async throws -> [String] { + let startTime = CFAbsoluteTimeGetCurrent() + + let response = try await makeSession().respond( + to: makePrompt(post: post, siteTags: siteTags, postTags: postTags), + generating: Result.self, + options: options + ) + + WPLogInfo("TagSuggestion executed in \((CFAbsoluteTimeGetCurrent() - startTime) * 1000) ms") + + let existingPostTags = Set(postTags) + return response.content.tags + .deduplicated() + .filter { !existingPostTags.contains($0) } + } + + /// Creates a language model session configured for tag suggestion. + public func makeSession() -> LanguageModelSession { + LanguageModelSession( + model: .init(guardrails: .permissiveContentTransformations), + instructions: Self.instructions + ) + } + + /// Instructions for the language model session. + public static var instructions: String { + """ + You are helping a WordPress user add tags to a post or a page. + + **Parameters** + - POST_CONTENT: contents of the post (HTML or plain text) + - SITE_TAGS: case-sensitive comma-separated list of the existing tags used elsewhere on the site (not always relevant to the post) + - EXISTING_POST_TAGS: tags already added to the post + + \(IntelligenceService.makeLocaleInstructions()) + + **Steps** + - 1. Identify the specific formatting pattern used (e.g., lowercase with underscores, capitalized words with spaces, etc) + - 2. Identify the language used in SITE_TAGS and POST_CONTENT + - 3. Generate a list of relevant suggested tags based on POST_CONTENT and SITE_TAGS relevant to the content. 
+ + **Requirements** + - You MUST generate tags in the same language as SITE_TAGS and POST_CONTENT + - Tags MUST match the formatting pattern and language of existing tags + - Do not include any tags from EXISTING_POST_TAGS + - If there are no relevant suggestions, returns an empty list + - Do not produce any output other than the final list of tags + """ + } + + /// Creates a prompt for tag suggestion with the given parameters. + public func makePrompt(post: String, siteTags: [String], postTags: [String]) -> String { + // Limit siteTags and content size to respect context window + let siteTags = siteTags.prefix(50) + let post = IntelligenceService.extractRelevantText(from: post) + + return """ + Suggest tags for a post. + + POST_CONTENT: ''' + \(post) + ''' + + SITE_TAGS: '\(siteTags.joined(separator: ", "))' + + EXISTING_POST_TAGS: '\(postTags.joined(separator: ", "))' + """ + } + + /// Prompt for generating additional tag suggestions. + public static var loadMorePrompt: String { + "Generate additional relevant tags following the same format and language requirements" + } + + // MARK: - Result Type + + @Generable + public struct Result { + @Guide(description: "Newly generated tags following the identified format", .count(5...10)) + public var tags: [String] + } +} + +private extension Array where Element: Hashable { + func deduplicated() -> [Element] { + var seen = Set<Element>() + return filter { seen.insert($0).inserted } + } +} diff --git a/Modules/Sources/WordPressShared/Intelligence/IntelligenceUtilities.swift b/Modules/Sources/WordPressIntelligence/Utilities/ContentExtractor.swift similarity index 98% rename from Modules/Sources/WordPressShared/Intelligence/IntelligenceUtilities.swift rename to Modules/Sources/WordPressIntelligence/Utilities/ContentExtractor.swift index 47406b0ed1e9..e6c0c1bf522b 100644 --- a/Modules/Sources/WordPressShared/Intelligence/IntelligenceUtilities.swift +++ b/Modules/Sources/WordPressIntelligence/Utilities/ContentExtractor.swift @@ -1,7 
+1,7 @@ import Foundation import SwiftSoup -public struct IntelligenceUtilities { +public enum ContentExtractor { /// Extracts semantically meaningful content from HTML for LLM processing. /// /// Optimized for language models by: diff --git a/Modules/Sources/WordPressShared/Intelligence/IntelligenceService.swift b/Modules/Sources/WordPressShared/Intelligence/IntelligenceService.swift deleted file mode 100644 index 66f386c49c9e..000000000000 --- a/Modules/Sources/WordPressShared/Intelligence/IntelligenceService.swift +++ /dev/null @@ -1,179 +0,0 @@ -import Foundation -import FoundationModels - -@available(iOS 26, *) -public actor IntelligenceService { - /// A single token corresponds to three or four characters in languages like - /// English, Spanish, or German, and one token per character in languages like - /// Japanese, Chinese, or Korean. In a single session, the sum of all tokens - /// in the instructions, all prompts, and all outputs count toward the context window size. - /// - /// https://developer.apple.com/documentation/foundationmodels/generating-content-and-performing-tasks-with-foundation-models#Consider-context-size-limits-per-session - static let contextSizeLimit = 4096 - - public nonisolated static var isSupported: Bool { - LanguageModelHelper.isSupported - } - - public init() {} - - /// Suggests tags for a WordPress post. - /// - /// - Parameters: - /// - post: The content of the WordPress post. - /// - siteTags: An array of existing tags used elsewhere on the site. - /// - postTags: An array of tags already assigned to the post. - /// - /// - Returns: An array of suggested tags. - public func suggestTags(post: String, siteTags: [String] = [], postTags: [String] = []) async throws -> [String] { - let startTime = CFAbsoluteTimeGetCurrent() - - // We have to be mindful of the content size limit, so we - // only support a subset of tags, preamptively remove Gutenberg tags - // from the content, and limit the content size. 
- - // A maximum of 500 characters assuming 10 characters per - let siteTags = siteTags.prefix(50) - let post = extractRelevantText(from: post) - - try Task.checkCancellation() - - // Notes: - // - It was critical to add "case-sensitive" as otherwise it would ignore - // case sensitivity and pick the wrong output format. - // - The lowered temperature helped improved the accuracy. - // - `useCase: .contentTagging` is not recommended for arbitraty hashtags - - let instructions = """ - You are helping a WordPress user add tags to a post or a page. - - **Parameters** - - POST_CONTENT: contents of the post (HTML or plain text) - - SITE_TAGS: case-sensitive comma-separated list of the existing tags used elsewhere on the site (not always relevant to the post) - - EXISTING_POST_TAGS: tags already added to the post - - **Steps** - - 1. Identify the specific formatting pattern used (e.g., lowercase with underscores, capitalized words with spaces, etc) - - 2. Generate a list of ten most relevant suggested tags based on POST_CONTENT and SITE_TAGS relevant to the content. - - **Requirements** - - Do not include any tags from EXISTING_POST_TAGS - - If there are no relevant suggestions, returns an empty list - - Do not produce any output other than the final list of tag - """ - - let session = LanguageModelSession( - model: .init(guardrails: .permissiveContentTransformations), - instructions: instructions - ) - - let prompt = """ - Suggest up to ten tags for a post. 
- - POST_CONTENT: ''' - \(post) - ''' - - SITE_TAGS: '\(siteTags.joined(separator: ", "))' - - EXISTING_POST_TAGS: '\(postTags.joined(separator: ", "))' - """ - - let response = try await session.respond( - to: prompt, - generating: SuggestedTagsResult.self, - options: GenerationOptions(temperature: 0.2) - ) - - WPLogInfo("IntelligenceService.suggestTags executed in \((CFAbsoluteTimeGetCurrent() - startTime) * 1000) ms") - - let existingPostTags = Set(postTags) - return response.content.tags - .deduplicated() - .filter { !existingPostTags.contains($0) } - } - - /// Summarizes a WordPress post. - /// - /// - Parameter content: The content of the WordPress post (HTML or plain text). - /// - Returns: An async stream of partial summaries as they are generated. - public func summarizePost(content: String) -> LanguageModelSession.ResponseStream { - let content = extractRelevantText(from: content, ratio: 0.8) - - let instructions = """ - You are helping a WordPress user understand the content of a post. - Generate a concise summary that captures the main points and key information. - The summary should be clear, informative, and written in a neutral tone. - - Do not include anything other than the summary in the response. - """ - - let session = LanguageModelSession( - model: .init(guardrails: .permissiveContentTransformations), - instructions: instructions - ) - - let prompt = """ - Summarize the following post: - - \(content) - """ - - return session.streamResponse(to: prompt) - } - - public func summarizeSupportTicket(content: String) async throws -> String { - let instructions = """ - You are helping a user by summarizing their support request down to a single sentence - with fewer than 10 words. - - The summary should be clear, informative, and written in a neutral tone. - - Do not include anything other than the summary in the response. 
- """ - - let session = LanguageModelSession( - model: .init(guardrails: .permissiveContentTransformations), - instructions: instructions - ) - - let prompt = """ - Give me an appropriate conversation title for the following opening message of the conversation: - - \(content) - """ - - return try await session.respond( - to: prompt, - generating: SuggestedConversationTitle.self, - options: GenerationOptions(temperature: 1.0) - ).content.title - } - - public nonisolated func extractRelevantText(from post: String, ratio: CGFloat = 0.6) -> String { - let extract = try? IntelligenceUtilities.extractRelevantText(from: post) - let postSizeLimit = Double(IntelligenceService.contextSizeLimit) * ratio - return String((extract ?? post).prefix(Int(postSizeLimit))) - } -} - -private extension Array where Element: Hashable { - func deduplicated() -> [Element] { - var seen = Set() - return filter { seen.insert($0).inserted } - } -} - -@available(iOS 26, *) -@Generable -private struct SuggestedTagsResult { - @Guide(description: "Newly generated tags following the identified format") - var tags: [String] -} - -@available(iOS 26, *) -@Generable -private struct SuggestedConversationTitle { - @Guide(description: "The conversation title") - var title: String -} diff --git a/Modules/Sources/WordPressShared/Intelligence/LanguageModelHelper.swift b/Modules/Sources/WordPressShared/Intelligence/LanguageModelHelper.swift deleted file mode 100644 index 663a2cb3c080..000000000000 --- a/Modules/Sources/WordPressShared/Intelligence/LanguageModelHelper.swift +++ /dev/null @@ -1,142 +0,0 @@ -import Foundation -import FoundationModels - -public enum LanguageModelHelper { - public static var isSupported: Bool { - guard #available(iOS 26, *) else { return false } - switch SystemLanguageModel.default.availability { - case .available: - return true - case .unavailable(let reason): - switch reason { - case .appleIntelligenceNotEnabled, .modelNotReady: - return true - case .deviceNotEligible: - return 
false - @unknown default: - return false - } - } - } - - public static var generateExcerptInstructions: String { - """ - Generate exactly 3 excerpts for the blog post and follow the instructions from the prompt regarding the length and the style. - - **Paramters** - - POST_CONTENT: contents of the post (HTML or plain text) - - GENERATED_CONTENT_LENGTH: the length of the generated content - - GENERATION_STYLE: the writing style to follow - - **Requirements** - - Each excerpt must follow the provided GENERATED_CONTENT_LENGTH and use GENERATION_STYLE - - **Excerpt best practices** - - Follow the best practices for post excerpts esteblished in the WordPress ecosystem - - Include the post's main value proposition - - Use active voice (avoid "is", "are", "was", "were" when possible) - - End with implicit promise of more information - - Do not use ellipsis (...) at the end - - Focus on value, not summary - - Include strategic keywords naturally - - Write independently from the introduction – excerpt shouldn't just duplicate your opening paragraph. While your introduction eases readers into the topic, your excerpt needs to work as standalone copy that makes sense out of context—whether it appears in search results, social media cards, or email newsletters. 
- """ - } - - public static func makeGenerateExcerptPrompt( - content: String, - length: GeneratedContentLength, - style: GenerationStyle - ) -> String { - """ - Generate three different excerpts for the given post and parameters - - GENERATED_CONTENT_LENGTH: \(length.promptModifier) - - GENERATION_STYLE: \(style.promptModifier) - - POST_CONTENT: ''' - \(content) - """ - } - - public static var generateMoreOptionsPrompt: String { - "Generate additional three options" - } -} - -public enum GenerationStyle: String, CaseIterable, RawRepresentable { - case engaging - case conversational - case witty - case formal - case professional - - public var displayName: String { - switch self { - case .engaging: - NSLocalizedString("generation.style.engaging", value: "Engaging", comment: "AI generation style") - case .conversational: - NSLocalizedString("generation.style.conversational", value: "Conversational", comment: "AI generation style") - case .witty: - NSLocalizedString("generation.style.witty", value: "Witty", comment: "AI generation style") - case .formal: - NSLocalizedString("generation.style.formal", value: "Formal", comment: "AI generation style") - case .professional: - NSLocalizedString("generation.style.professional", value: "Professional", comment: "AI generation style") - } - } - - public var promptModifier: String { - "\(rawValue) (\(promptModifierDetails))" - } - - var promptModifierDetails: String { - switch self { - case .engaging: "engaging and compelling tone" - case .witty: "witty, creative, entertaining" - case .conversational: "friendly and conversational tone" - case .formal: "formal and academic tone" - case .professional: "professional and polished tone" - } - } -} - -public enum GeneratedContentLength: Int, CaseIterable, RawRepresentable { - case short - case medium - case long - - public var displayName: String { - switch self { - case .short: - NSLocalizedString("generation.length.short", value: "Short", comment: "Generated content length (needs 
to be short)") - case .medium: - NSLocalizedString("generation.length.medium", value: "Medium", comment: "Generated content length (needs to be short)") - case .long: - NSLocalizedString("generation.length.long", value: "Long", comment: "Generated content length (needs to be short)") - } - } - - public var trackingName: String { name } - - public var promptModifier: String { - "\(wordRange) words" - } - - private var name: String { - switch self { - case .short: "short" - case .medium: "medium" - case .long: "long" - } - } - - private var wordRange: String { - switch self { - case .short: "20-40" - case .medium: "50-70" - case .long: "120-180" - } - } -} diff --git a/Modules/Tests/WordPressIntelligenceTests/.gitignore b/Modules/Tests/WordPressIntelligenceTests/.gitignore new file mode 100644 index 000000000000..fb8aad57ef50 --- /dev/null +++ b/Modules/Tests/WordPressIntelligenceTests/.gitignore @@ -0,0 +1,3 @@ +lib/__pycache__/ +*.pyc +*.pyo diff --git a/Modules/Tests/WordPressSharedTests/IntelligenceUtilitiesTests.swift b/Modules/Tests/WordPressIntelligenceTests/ContentExtractorTests.swift similarity index 95% rename from Modules/Tests/WordPressSharedTests/IntelligenceUtilitiesTests.swift rename to Modules/Tests/WordPressIntelligenceTests/ContentExtractorTests.swift index 657644c0e0fd..ccec0ddb38db 100644 --- a/Modules/Tests/WordPressSharedTests/IntelligenceUtilitiesTests.swift +++ b/Modules/Tests/WordPressIntelligenceTests/ContentExtractorTests.swift @@ -1,9 +1,9 @@ import Testing -@testable import WordPressShared +@testable import WordPressIntelligence -struct IntelligenceUtilitiesTests { +struct ContentExtractorTests { @Test func extractRelevantText() throws { - let text = try IntelligenceUtilities.extractRelevantText(from: IntelligenceUtilities.post) + let text = try ContentExtractor.extractRelevantText(from: ContentExtractor.post) #expect(text == """

The Art of Making Perfect Sourdough Bread at Home

@@ -52,7 +52,7 @@ struct IntelligenceUtilitiesTests { /// Blockquote contain nested block and the implementation should account for that. @Test func blockquotes() throws { - let text = try IntelligenceUtilities.extractRelevantText(from: """ + let text = try ContentExtractor.extractRelevantText(from: """

Welcome to WordPress! This is your first post. Edit or delete it to take the first step in your blogging journey.

@@ -71,13 +71,13 @@ struct IntelligenceUtilitiesTests { } @Test func extractRelevantTextFromPlainText() throws { - let text = try IntelligenceUtilities.extractRelevantText(from: "This is a plain text post") + let text = try ContentExtractor.extractRelevantText(from: "This is a plain text post") #expect(text == "This is a plain text post") } } -extension IntelligenceUtilities { +extension ContentExtractor { static let post = """

The Art of Making Perfect Sourdough Bread at Home

diff --git a/Modules/Tests/WordPressIntelligenceTests/Helpers/ExcerptTestOutput.swift b/Modules/Tests/WordPressIntelligenceTests/Helpers/ExcerptTestOutput.swift new file mode 100644 index 000000000000..8f89103ccdcd --- /dev/null +++ b/Modules/Tests/WordPressIntelligenceTests/Helpers/ExcerptTestOutput.swift @@ -0,0 +1,63 @@ +import Foundation +import NaturalLanguage + +/// Structured output for excerpt tests that can be consumed by evaluation scripts. +struct ExcerptTestOutput: Codable { + let testName: String + let language: String + let length: String + let style: String + let originalContent: String + let excerpts: [String] + let duration: Double + let timestamp: String + + /// Convenience initializer that accepts ExcerptTestCaseParameters and Duration. + init( + parameters: ExcerptTestCaseParameters, + excerpts: [String], + duration: Duration + ) { + self.testName = parameters.testDescription + self.language = parameters.data.languageCode.rawValue + self.length = parameters.length.displayName + self.style = parameters.style.displayName + self.originalContent = parameters.data.content + self.excerpts = excerpts + self.duration = Double(duration.components.seconds) + Double(duration.components.attoseconds) / 1e18 + self.timestamp = ISO8601DateFormatter().string(from: Date()) + } + + /// Record test output to console for extraction and print formatted results. + /// Emits base64-encoded JSON between markers for reliable parsing. + /// This output can be extracted and evaluated by external tools. + func recordAndPrint(parameters: ExcerptTestCaseParameters, duration: Duration) throws { + // Always record structured output for evaluation script + try record() + + // Print formatted results for readability + TestHelpers.printExcerptResults( + parameters: parameters, + excerpts: excerpts, + duration: duration + ) + } + + /// Record test output to console for extraction. + /// Emits base64-encoded JSON between markers for reliable parsing. 
+ /// This output can be extracted and evaluated by external tools. + private func record() throws { + // Encode to JSON + let encoder = JSONEncoder() + encoder.outputFormatting = [.sortedKeys] + let jsonData = try encoder.encode(self) + + // Base64 encode for safe console transmission + let base64String = jsonData.base64EncodedString() + + // Emit structured output with markers + print("__EXCERPT_OUTPUT_START__") + print(base64String) + print("__EXCERPT_OUTPUT_END__") + } +} diff --git a/Modules/Tests/WordPressIntelligenceTests/Helpers/SummaryTestOutput.swift b/Modules/Tests/WordPressIntelligenceTests/Helpers/SummaryTestOutput.swift new file mode 100644 index 000000000000..3b0130dc7b93 --- /dev/null +++ b/Modules/Tests/WordPressIntelligenceTests/Helpers/SummaryTestOutput.swift @@ -0,0 +1,61 @@ +import Foundation +import NaturalLanguage + +/// Structured output for post summary tests that can be consumed by evaluation scripts. +struct SummaryTestOutput: Codable { + let testType: String + let testName: String + let language: String + let originalContent: String + let summary: String + let duration: Double + let timestamp: String + + /// Convenience initializer that accepts SummaryTestCaseParameters and Duration. + init( + parameters: SummaryTestCaseParameters, + summary: String, + duration: Duration + ) { + self.testType = "post-summary" + self.testName = parameters.testDescription + self.language = parameters.data.languageCode.rawValue + self.originalContent = parameters.data.content + self.summary = summary + self.duration = Double(duration.components.seconds) + Double(duration.components.attoseconds) / 1e18 + self.timestamp = ISO8601DateFormatter().string(from: Date()) + } + + /// Record test output to console for extraction and print formatted results. + /// Emits base64-encoded JSON between markers for reliable parsing. + /// This output can be extracted and evaluated by external tools. 
+ func recordAndPrint(parameters: SummaryTestCaseParameters, duration: Duration) throws { + // Always record structured output for evaluation script + try record() + + // Print formatted results for readability + TestHelpers.printSummaryResults( + parameters: parameters, + summary: summary, + duration: duration + ) + } + + /// Record test output to console for extraction. + /// Emits base64-encoded JSON between markers for reliable parsing. + /// This output can be extracted and evaluated by external tools. + private func record() throws { + // Encode to JSON + let encoder = JSONEncoder() + encoder.outputFormatting = [.sortedKeys] + let jsonData = try encoder.encode(self) + + // Base64 encode for safe console transmission + let base64String = jsonData.base64EncodedString() + + // Emit structured output with markers + print("__SUMMARY_OUTPUT_START__") + print(base64String) + print("__SUMMARY_OUTPUT_END__") + } +} diff --git a/Modules/Tests/WordPressIntelligenceTests/Helpers/TagTestOutput.swift b/Modules/Tests/WordPressIntelligenceTests/Helpers/TagTestOutput.swift new file mode 100644 index 000000000000..9945539d6d74 --- /dev/null +++ b/Modules/Tests/WordPressIntelligenceTests/Helpers/TagTestOutput.swift @@ -0,0 +1,65 @@ +import Foundation +import NaturalLanguage + +/// Structured output for tag suggestion tests that can be consumed by evaluation scripts. +struct TagTestOutput: Codable { + let testType: String + let testName: String + let language: String + let originalContent: String + let siteTags: [String] + let existingPostTags: [String] + let tags: [String] + let duration: Double + let timestamp: String + + /// Convenience initializer that accepts TagTestCaseParameters and Duration. 
+ init( + parameters: TagTestCaseParameters, + tags: [String], + duration: Duration + ) { + self.testType = "tag-suggestion" + self.testName = parameters.testDescription + self.language = parameters.data.languageCode.rawValue + self.originalContent = parameters.data.content + self.siteTags = parameters.siteTags + self.existingPostTags = parameters.postTags + self.tags = tags + self.duration = Double(duration.components.seconds) + Double(duration.components.attoseconds) / 1e18 + self.timestamp = ISO8601DateFormatter().string(from: Date()) + } + + /// Record test output to console for extraction and print formatted results. + /// Emits base64-encoded JSON between markers for reliable parsing. + /// This output can be extracted and evaluated by external tools. + func recordAndPrint(parameters: TagTestCaseParameters, duration: Duration) throws { + // Always record structured output for evaluation script + try record() + + // Print formatted results for readability + TestHelpers.printTagResults( + parameters: parameters, + tags: tags, + duration: duration + ) + } + + /// Record test output to console for extraction. + /// Emits base64-encoded JSON between markers for reliable parsing. + /// This output can be extracted and evaluated by external tools. 
+ private func record() throws { + // Encode to JSON + let encoder = JSONEncoder() + encoder.outputFormatting = [.sortedKeys] + let jsonData = try encoder.encode(self) + + // Base64 encode for safe console transmission + let base64String = jsonData.base64EncodedString() + + // Emit structured output with markers + print("__TAG_OUTPUT_START__") + print(base64String) + print("__TAG_OUTPUT_END__") + } +} diff --git a/Modules/Tests/WordPressIntelligenceTests/Helpers/TestData.swift b/Modules/Tests/WordPressIntelligenceTests/Helpers/TestData.swift new file mode 100644 index 000000000000..82e23a3d1b72 --- /dev/null +++ b/Modules/Tests/WordPressIntelligenceTests/Helpers/TestData.swift @@ -0,0 +1,450 @@ +import Foundation +import NaturalLanguage + +/// Test content with title and body for intelligence service tests. +struct TestContent { + let title: String + let content: String + let languageCode: NLLanguage +} + +/// Shared test data for intelligence service tests. +/// +/// This enum provides sample content in multiple languages for testing +/// excerpt generation, post summarization, and tag suggestion features. +enum TestData { + // MARK: - English Content + + static let englishPostWithHTML = TestContent( + title: "English Post with HTML", + content: """ + +

The Art of Sourdough Bread Making

+ + + +

Sourdough bread has experienced a remarkable revival in recent years, with home bakers + around the world rediscovering this ancient craft. The natural fermentation process creates + a distinctive tangy flavor and numerous health benefits that make it worth the extra effort.

+ + + +

Essential Ingredients

+ + + +
    +
  • Active sourdough starter
  • +
  • 500g bread flour
  • +
  • 350ml filtered water
  • +
  • 10g sea salt
  • +
  • Optional: seeds or grains for texture
  • +
+ + + +

The key to successful sourdough lies in maintaining a healthy starter culture and + understanding the fermentation process. Temperature and timing are crucial factors that + will determine the final texture and flavor of your bread.

+ + """, + languageCode: .english + ) + + static let veryShortEnglishContent = TestContent( + title: "Very Short English Content", + content: "Artificial intelligence is transforming our world in unprecedented ways.", + languageCode: .english + ) + + // MARK: - Spanish Content + + static let spanishPost = TestContent( + title: "Spanish Post", + content: """ + La paella valenciana es uno de los platos más emblemáticos de la gastronomía española. + Originaria de Valencia, esta receta tradicional combina arroz, azafrán, y una variedad + de ingredientes que pueden incluir pollo, conejo, judías verdes, y garrofón. + + La clave para una paella perfecta está en el sofrito inicial y en el punto exacto del arroz. + El azafrán no solo aporta ese característico color dorado, sino también un sabor único + e inconfundible. + + Es importante utilizar un buen caldo casero y arroz de calidad, preferiblemente de la + variedad bomba o senia. El fuego debe ser fuerte al principio y suave al final para + conseguir el socarrat, esa capa crujiente de arroz que se forma en el fondo de la paellera. + """, + languageCode: .spanish + ) + + static let spanishReaderArticle = TestContent( + title: "Spanish Reader Article", + content: """ + El cambio climático está afectando de manera significativa a los ecosistemas marinos + del Mediterráneo. Científicos del CSIC han documentado un aumento de 2 grados en la + temperatura media del agua durante los últimos 30 años, lo que ha provocado cambios + en las rutas migratorias de varias especies de peces y la proliferación de especies + invasoras procedentes de aguas más cálidas. + """, + languageCode: .spanish + ) + + // MARK: - English Content + + static let englishTechPost = TestContent( + title: "English Tech Post", + content: """ + Quantum computing represents a paradigm shift in how we approach computational problems. 
Unlike + classical computers that use bits (0s and 1s), quantum computers leverage qubits that can exist + in superposition, simultaneously representing multiple states. + + This fundamental difference enables quantum computers to tackle problems that are intractable + for classical machines. Drug discovery, cryptography, optimization, and climate modeling are + just a few domains poised for revolutionary breakthroughs. + + However, significant challenges remain. Quantum systems are incredibly fragile, requiring + near-absolute-zero temperatures and isolation from environmental interference. Error correction + is another major hurdle, as quantum states are prone to decoherence. + """, + languageCode: .english + ) + + static let englishAcademicPost = TestContent( + title: "English Academic Post", + content: """ + The phenomenon of linguistic relativity, often referred to as the Sapir-Whorf hypothesis, + posits that the structure of a language influences its speakers' worldview and cognition. + While the strong version of this hypothesis has been largely discredited, contemporary research + suggests more nuanced relationships between language and thought. + + Recent studies in cognitive linguistics have demonstrated that language can indeed affect + perception and categorization, particularly in domains like color perception, spatial reasoning, + and temporal cognition. However, these effects are context-dependent and vary significantly + across different cognitive domains. + + Cross-linguistic research continues to provide valuable insights into the universal and + language-specific aspects of human cognition, challenging researchers to refine their + theoretical frameworks and methodological approaches. 
+ """, + languageCode: .english + ) + + static let englishStoryPost = TestContent( + title: "English Story Post", + content: """ + The old lighthouse keeper had seen many storms in his forty years tending the beacon, but + none quite like the tempest that rolled in that October evening. Dark clouds gathered on + the horizon like an invading army, their edges tinged with an unsettling green hue. + + As the first drops of rain pelted the lighthouse windows, Magnus checked the lamp one final + time. The beam cut through the gathering darkness, a lifeline for any vessels brave or foolish + enough to be out on such a night. He'd heard the coastguard warnings on the radio—winds + exceeding 90 miles per hour, waves reaching heights of 30 feet. + + Down in the keeper's quarters, Magnus brewed strong coffee and settled into his worn leather + chair. Outside, the wind howled like a wounded beast, but within these thick stone walls, + he felt safe. This lighthouse had withstood two centuries of nature's fury; it would stand + through one more night. + """, + languageCode: .english + ) + + static let englishPost = TestContent( + title: "English Post", + content: """ + Sourdough bread has experienced a remarkable revival in recent years, with home bakers + around the world rediscovering this ancient craft. The natural fermentation process + creates a distinctive tangy flavor and numerous health benefits. + + The key to successful sourdough lies in maintaining a healthy starter culture. This + living mixture of flour and water harbors wild yeast and beneficial bacteria that + work together to leaven the bread and develop complex flavors. + + Temperature and timing are crucial factors. The fermentation process can take anywhere + from 12 to 24 hours, depending on ambient temperature and the activity of your starter. 
+ """, + languageCode: .english + ) + + static let englishReaderArticle = TestContent( + title: "English Reader Article", + content: """ + Recent advances in quantum computing have brought us closer to solving complex problems + that are impossible for classical computers. Google's quantum processor achieved + quantum supremacy by performing a calculation in 200 seconds that would take the world's + fastest supercomputer 10,000 years to complete. However, practical applications for + everyday computing are still years away. + """, + languageCode: .english + ) + + // MARK: - French Content + + static let frenchPost = TestContent( + title: "French Post", + content: """ + La cuisine française est reconnue mondialement pour sa finesse et sa diversité. + Du coq au vin bourguignon au délicieux cassoulet du Sud-Ouest, chaque région possède + ses spécialités qui racontent une histoire culinaire unique. + + Les techniques de base de la cuisine française, comme le mirepoix, le roux, et les + cinq sauces mères, constituent le fondement de nombreuses préparations classiques. + Ces méthodes transmises de génération en génération permettent de créer des plats + d'une grande complexité et raffinement. + + L'utilisation d'ingrédients frais et de saison est primordiale. Les marchés locaux + offrent une abondance de produits qui inspirent les chefs et les cuisiniers amateurs. + """, + languageCode: .french + ) + + // MARK: - Japanese Content + + static let japanesePost = TestContent( + title: "Japanese Post", + content: """ + 日本料理の基本である出汁は、昆布と鰹節から作られる伝統的な調味料です。 + この旨味の素は、味噌汁、煮物、そして様々な料理の基礎となっています。 + + 正しい出汁の取り方は、まず昆布を水に浸して弱火でゆっくりと加熱します。 + 沸騰直前に昆布を取り出し、その後鰹節を加えて数分間煮出します。 + + 良質な出汁を使うことで、料理全体の味わいが格段に向上します。 + インスタント出汁も便利ですが、本格的な料理には手作りの出汁が欠かせません。 + """, + languageCode: .japanese + ) + + // MARK: - German Content + + static let germanTechPost = TestContent( + title: "German Tech Post", + content: """ + Die deutsche Automobilindustrie steht vor einem beispiellosen Wandel. 
Der Übergang von + Verbrennungsmotoren zu Elektroantrieben erfordert nicht nur technologische Innovation, + sondern auch eine grundlegende Neuausrichtung der gesamten Wertschöpfungskette. + + Traditionelle Zulieferer müssen sich anpassen oder riskieren, obsolet zu werden. Gleichzeitig + entstehen neue Geschäftsmodelle rund um Batterietechnologie, Ladeinfrastruktur und + Software-definierte Fahrzeuge. Die Frage ist nicht mehr, ob dieser Wandel kommt, sondern + wie schnell deutsche Unternehmen sich anpassen können, um ihre führende Position in der + globalen Automobilbranche zu behalten. + """, + languageCode: .german + ) + + // MARK: - Mandarin Content + + static let mandarinPost = TestContent( + title: "Mandarin Post", + content: """ + 中国茶文化有着数千年的悠久历史,是中华文明的重要组成部分。从绿茶到红茶, + 从乌龙茶到普洱茶,每一种茶都有其独特的制作工艺和品鉴方法。 + + 茶道不仅仅是一种饮茶的方式,更是一种生活态度和精神追求。通过泡茶、品茶的过程, + 人们可以修身养性,体会宁静致远的境界。 + + 好的茶叶需要适宜的水温和冲泡时间。绿茶适合用80度左右的水温,而红茶则需要 + 95度以上的沸水。掌握这些细节,才能充分释放茶叶的香气和味道。 + """, + languageCode: .simplifiedChinese + ) + + // MARK: - Hindi Content + + static let hindiPost = TestContent( + title: "Hindi Post", + content: """ + योग भारतीय संस्कृति की एक प्राचीन परंपरा है जो शारीरिक, मानसिक और आध्यात्मिक स्वास्थ्य को बढ़ावा देती है। + आसन, प्राणायाम और ध्यान के माध्यम से, योग हमें संतुलित और स्वस्थ जीवन जीने में मदद करता है। + + नियमित योग अभ्यास से तनाव कम होता है, मांसपेशियां मजबूत होती हैं, और मन शांत रहता है। + सूर्य नमस्कार, शवासन, और पद्मासन जैसे आसन शुरुआती लोगों के लिए बहुत उपयोगी हैं। + + योग केवल व्यायाम नहीं है, बल्कि यह जीवन जीने की एक कला है। प्रतिदिन कुछ मिनट योग करने से + जीवन की गुणवत्ता में उल्लेखनीय सुधार हो सकता है। + """, + languageCode: .hindi + ) + + // MARK: - Russian Content + + static let russianPost = TestContent( + title: "Russian Post", + content: """ + Русская литература золотого века подарила миру величайшие произведения, которые + продолжают вдохновлять читателей по всему свету. 
Толстой, Достоевский, Чехов и + Пушкин создали произведения, исследующие глубины человеческой души. + + Эти авторы не просто рассказывали истории, они поднимали фундаментальные вопросы + о смысле жизни, морали, и человеческой природе. Их произведения остаются актуальными + и сегодня, предлагая читателям глубокие размышления о вечных темах. + + Чтение классической русской литературы — это путешествие в мир сложных характеров, + философских идей и богатого культурного наследия. Каждое произведение открывает + новые горизонты понимания человеческого опыта. + """, + languageCode: .russian + ) + + // MARK: - Mixed Language Content + + static let mixedLanguagePost = TestContent( + title: "Mixed Language Post", + content: """ + The Mediterranean Diet: Una Guía Completa + + The Mediterranean diet has been recognized by UNESCO as an Intangible Cultural Heritage + of Humanity. Esta dieta tradicional se basa en el consumo de aceite de oliva, frutas, + verduras, legumbres, y pescado. + + Los beneficios para la salud son numerosos: reduced risk of heart disease, mejor + control del peso, y longevidad aumentada. Studies have shown that people who follow + this diet tend to live longer and healthier lives. + """, + languageCode: .english + ) + + // MARK: - Error Handling Test Cases + + static let emptyContent = TestContent( + title: "Empty Content", + content: "", + languageCode: .english + ) + + static let veryLongContent = TestContent( + title: "Very Long Content", + content: String(repeating: """ + Quantum computing represents a paradigm shift in computational technology. Unlike classical + computers that process information using bits (0s and 1s), quantum computers leverage the + principles of quantum mechanics to operate with qubits. These qubits can exist in multiple + states simultaneously through superposition, enabling parallel processing of vast amounts + of data. 
The phenomenon of quantum entanglement further enhances computational capabilities + by allowing qubits to be correlated in ways that classical bits cannot achieve. + + The implications of quantum computing extend across numerous fields. In cryptography, quantum + computers pose both a threat to current encryption methods and a promise for ultra-secure + quantum key distribution. Drug discovery and molecular modeling benefit from quantum simulation + of complex chemical interactions. Financial modeling, optimization problems, and artificial + intelligence are all domains poised for transformation through quantum algorithms. + + However, significant challenges remain before quantum computing becomes mainstream. Quantum + systems are extremely sensitive to environmental interference, requiring near-absolute-zero + temperatures and electromagnetic isolation. Quantum decoherence occurs when qubits lose their + quantum properties due to external disturbances, limiting the duration of quantum computations. + Error correction in quantum systems is fundamentally more complex than in classical computing, + requiring multiple physical qubits to encode a single logical qubit. + + Current quantum computers are in the NISQ era (Noisy Intermediate-Scale Quantum), characterized + by systems with 50-100 qubits that are prone to errors. Major technology companies and research + institutions are racing to achieve quantum advantage—the point where quantum computers can + solve practical problems faster than classical supercomputers. Google's quantum processor + achieved a milestone in 2019 by performing a specific calculation in 200 seconds that would + take the world's fastest supercomputer 10,000 years. + + """, count: 30) + "\n\nThis content continues for over 10,000 words to test handling of very long inputs.", + languageCode: .english + ) + + static let malformedHTML = TestContent( + title: "Malformed HTML", + content: """ +

Broken HTML Content

+

This paragraph is not closed properly +

This div has no closing tag +
    +
  • Item 1 +
  • Item 2
  • +
  • Item 3
  • +
+

Bold text with nested italics

+ + Missing closing bracket
+        <a href=Link with no closing tag + """, + languageCode: .english + ) + + static let emojiAndSpecialCharacters = TestContent( + title: "Emoji and Special Characters", + content: """ + 🌟 Welcome to the World of Unicode! 🌍 + + Emojis have become an integral part of digital communication 💬. From simple smileys 😊 + to complex sequences 👨‍👩‍👧‍👦, they convey emotions and ideas across language barriers. + + Special characters matter too: © ® ™ § ¶ † ‡ • ◦ ‣ ⁃ ⁎ ⁕ ❖ ※ + Mathematical symbols: ∑ ∏ √ ∞ ≈ ≠ ≤ ≥ ± × ÷ ∂ ∫ ∇ + Currency symbols: $ € £ ¥ ₹ ₽ ₩ ₪ ฿ ¢ + + Zero-width characters and combining marks: café vs café (different é construction) + Right-to-left marks: ‏עברית‏ العربية + Emoji variations: 👍 👍🏻 👍🏼 👍🏽 👍🏾 👍🏿 + + Uncommon Unicode: Ω ℃ ℉ № ℠ ™ ℮ ⅓ ⅔ ¼ ¾ ⅛ ⅜ ⅝ ⅞ + Box drawing: ┌─┬─┐ │ │ │ ├─┼─┤ │ │ │ └─┴─┘ + + This tests how the system handles diverse Unicode characters! 🎉✨🚀 + """, + languageCode: .english + ) + + // MARK: - Tag Data + + static let spanishSiteTags = [ + "recetas", + "cocina-española", + "gastronomía", + "comida-mediterránea", + "platos-tradicionales" + ] + + static let englishSiteTags = [ + "baking", + "bread-making", + "recipes", + "sourdough", + "homemade" + ] + + static let frenchSiteTags = [ + "cuisine", + "gastronomie-française", + "recettes", + "plats-traditionnels", + "art-culinaire" + ] + + static let japaneseSiteTags = [ + "日本料理", + "レシピ", + "料理", + "伝統", + "和食" + ] + + static let germanSiteTags = [ + "technologie", + "innovation", + "deutschland", + "automobil", + "elektromobilität" + ] + + static let mandarinSiteTags = [ + "文化", + "茶道", + "传统", + "生活方式", + "健康" + ] + + static let russianSiteTags = [ + "литература", + "культура", + "классика", + "искусство", + "философия" + ] +} diff --git a/Modules/Tests/WordPressIntelligenceTests/Helpers/TestHelperWordCountTests.swift b/Modules/Tests/WordPressIntelligenceTests/Helpers/TestHelperWordCountTests.swift new file mode 100644 index 000000000000..cdcc796e07fd --- /dev/null +++ 
b/Modules/Tests/WordPressIntelligenceTests/Helpers/TestHelperWordCountTests.swift @@ -0,0 +1,89 @@ +import Testing +import NaturalLanguage + +@Suite("Word Counting") +struct TestHelperWordCountTests { + + @Test("English word counting") + func englishWordCounting() { + let text = "The quick brown fox jumps over the lazy dog" + let count = TestHelpers.countWords(text, language: .english) + #expect(count == 9) + } + + @Test("Spanish word counting") + func spanishWordCounting() { + let text = "El rápido zorro marrón salta sobre el perro perezoso" + let count = TestHelpers.countWords(text, language: .spanish) + #expect(count == 9) + } + + @Test("Japanese word counting") + func japaneseWordCounting() { + // "I like Japanese food very much" - 5 meaningful word units + let text = "私は日本料理が大好きです" + let count = TestHelpers.countWords(text, language: .japanese) + + // NLTokenizer properly segments Japanese into word units + // Should recognize: 私/は/日本料理/が/大好き/です + #expect(count >= 5 && count <= 7, "Expected 5-7 words, got \(count)") + } + + @Test("Mandarin word counting") + func mandarinWordCounting() { + // "I like Chinese tea culture" - approximately 6-8 word units + let text = "我喜欢中国茶文化" + let count = TestHelpers.countWords(text, language: .simplifiedChinese) + + // NLTokenizer segments: 我/喜欢/中国/茶/文化 + #expect(count >= 4 && count <= 8, "Expected 4-8 words, got \(count)") + } + + @Test("French word counting with punctuation") + func frenchWordCountingWithPunctuation() { + let text = "Bonjour! Comment allez-vous aujourd'hui?" 
+ let count = TestHelpers.countWords(text, language: .french) + // "allez-vous" is correctly tokenized as 2 words (verb + pronoun) + #expect(count == 5) + } + + @Test("Empty text") + func emptyText() { + let count = TestHelpers.countWords("", language: .english) + #expect(count == 0) + } + + @Test("Single word") + func singleWord() { + let count = TestHelpers.countWords("Hello", language: .english) + #expect(count == 1) + } + + @Test("Text with extra whitespace") + func textWithWhitespace() { + let text = " Hello world " + let count = TestHelpers.countWords(text, language: .english) + #expect(count == 2) + } + + @Test("Mixed English and numbers") + func mixedContent() { + let text = "There are 3 apples and 5 oranges" + let count = TestHelpers.countWords(text, language: .english) + #expect(count == 7) + } + + @Test("German word counting with compounds") + func germanWordCounting() { + let text = "Die deutsche Automobilindustrie ist sehr wichtig" + let count = TestHelpers.countWords(text, language: .german) + #expect(count == 6) + } + + @Test("Russian word counting with Cyrillic") + func russianWordCounting() { + let text = "Русская литература очень богата и интересна" + let count = TestHelpers.countWords(text, language: .russian) + #expect(count == 6) + } +} diff --git a/Modules/Tests/WordPressIntelligenceTests/Helpers/TestHelpers.swift b/Modules/Tests/WordPressIntelligenceTests/Helpers/TestHelpers.swift new file mode 100644 index 000000000000..c4f4cee2995b --- /dev/null +++ b/Modules/Tests/WordPressIntelligenceTests/Helpers/TestHelpers.swift @@ -0,0 +1,617 @@ +import Foundation +import NaturalLanguage +import Testing +@testable import WordPressIntelligence + +/// Helper utilities for formatting intelligence test output. 
+enum TestHelpers { + + // MARK: - Tag Suggestions + + static func printTagResults( + _ title: String, + tags: [String] + ) { + printSectionHeader(title) + + print("📑 Generated \(tags.count) tags:") + print() + for (i, tag) in tags.enumerated() { + print(" \(i + 1). \(tag)") + } + + printSectionFooter() + } + + // MARK: - Summaries + + static func printSummaryResults( + _ title: String, + summary: String + ) { + printSectionHeader(title) + + let wordCount = summary.split(separator: " ").count + let charCount = summary.count + print("📊 Metrics: \(wordCount) words • \(charCount) characters") + print() + print("📝 Summary:") + print() + print(summary.wrapped(width: 80)) + + printSectionFooter() + } + + // MARK: - Excerpts + + static func printExcerptResults( + _ title: String, + excerpts: [String], + targetLength: String, + style: String, + expectedLanguage: NLLanguage, + duration: Duration + ) { + printSectionHeader(title) + + let durationSeconds = Double(duration.components.seconds) + Double(duration.components.attoseconds) / 1e18 + print("📝 Generated \(excerpts.count) variations (language: \(expectedLanguage.rawValue), time: \(String(format: "%.3f", durationSeconds))s)") + print() + + let boxWidth = 80 + + for (i, excerpt) in excerpts.enumerated() { + let wordCount = countWords(excerpt, language: expectedLanguage) + + let detectedLanguage = detectLanguage(excerpt) + let languageMatch = detectedLanguage == expectedLanguage + let languageIndicator = languageMatch ? "✓" : "✗" + let languageInfo = "\(languageIndicator) \(detectedLanguage?.rawValue ?? 
"unknown")" + + // Fixed-width header + let header = "Variation \(i + 1) (\(wordCount) words, \(languageInfo))" + let headerVisualWidth = visualLength(header) + let headerPadding = max(0, boxWidth - headerVisualWidth - 4) // -4 for "┌─ " + "┐" + print("┌─ \(header) " + String(repeating: "─", count: headerPadding) + "┐") + + // Content with consistent width and padding + // Use slightly smaller width for wrapping to avoid edge cases with emoji rendering + let innerWidth = boxWidth - 4 // -4 for "│ " and " │" + let wrapWidth = innerWidth - 2 // Be conservative to avoid overflow + + for line in excerpt.wrapped(width: wrapWidth).split(separator: "\n") { + let lineStr = String(line) + let lineVisualWidth = visualLength(lineStr) + let linePadding = max(0, innerWidth - lineVisualWidth) + print("│ \(lineStr)\(String(repeating: " ", count: linePadding)) │") + } + + // Fixed-width footer + print("└" + String(repeating: "─", count: boxWidth - 2) + "┘") + print() + } + } + + static func printExcerptResults( + parameters: ExcerptTestCaseParameters, + excerpts: [String], + duration: Duration + ) { + printExcerptResults( + parameters.testDescription, + excerpts: excerpts, + targetLength: parameters.length.promptModifier, + style: parameters.style.displayName, + expectedLanguage: parameters.data.languageCode, + duration: duration + ) + } + + @available(iOS 26, *) + static func printExcerptResults( + _ title: String, + excerpts: [String], + generator: PostExcerptGenerator, + expectedLanguage: NLLanguage, + duration: Duration + ) { + printExcerptResults( + title, + excerpts: excerpts, + targetLength: generator.length.promptModifier, + style: generator.style.displayName, + expectedLanguage: expectedLanguage, + duration: duration + ) + } + + // MARK: - Comparison Tables + + static func printComparisonTable( + _ title: String, + headers: [String], + rows: [[String]] + ) { + printSectionHeader(title) + + // Calculate column widths + var widths = headers.map { $0.count } + for row in 
rows { + for (i, cell) in row.enumerated() where i < widths.count { + widths[i] = max(widths[i], cell.count) + } + } + + // Print header + print("┌", terminator: "") + for (i, width) in widths.enumerated() { + print(String(repeating: "─", count: width + 2), terminator: "") + print(i < widths.count - 1 ? "┬" : "┐\n", terminator: "") + } + + print("│", terminator: "") + for (i, header) in headers.enumerated() { + print(" \(header.padding(toLength: widths[i], withPad: " ", startingAt: 0)) ", terminator: "") + print(i < headers.count - 1 ? "│" : "│\n", terminator: "") + } + + // Print separator + print("├", terminator: "") + for (i, width) in widths.enumerated() { + print(String(repeating: "─", count: width + 2), terminator: "") + print(i < widths.count - 1 ? "┼" : "┤\n", terminator: "") + } + + // Print rows + for row in rows { + print("│", terminator: "") + for (i, cell) in row.enumerated() where i < widths.count { + print(" \(cell.padding(toLength: widths[i], withPad: " ", startingAt: 0)) ", terminator: "") + print(i < row.count - 1 ? "│" : "│\n", terminator: "") + } + } + + // Print footer + print("└", terminator: "") + for (i, width) in widths.enumerated() { + print(String(repeating: "─", count: width + 2), terminator: "") + print(i < widths.count - 1 ? "┴" : "┘\n", terminator: "") + } + + printSectionFooter() + } + + // MARK: - Utilities + + private static func printSectionHeader(_ title: String) { + let boxWidth = 80 + let border = String(repeating: "═", count: boxWidth) + + print() + print("╔\(border)╗") + + // Extract language from title (first word) + let language = title.split(separator: " ").first.map(String.init) + let flag = language.map { languageFlag(for: $0) } ?? "" + let displayTitle = flag.isEmpty ? 
title : "\(flag) \(title)" + + // Calculate padding (accounting for emoji visual width) + // Flag emojis are 2 unicode scalars but display as ~2 visual spaces + let visualWidth = visualLength(displayTitle) + let paddingNeeded = boxWidth - visualWidth - 2 // -2 for "║ " and " ║" + let paddedTitle = displayTitle + String(repeating: " ", count: max(0, paddingNeeded)) + + print("║ \(paddedTitle) ║") + print("╠\(border)╣") + print() + } + + private static func printSectionFooter() { + let boxWidth = 80 + let border = String(repeating: "═", count: boxWidth) + print("╚\(border)╝") + print() + } + + // MARK: - Performance Measurement + + /// Measures the execution time of an async throwing operation. + static func measure( + _ operation: () async throws -> T + ) async throws -> (result: T, duration: Duration) { + let clock = ContinuousClock() + let start = clock.now + let result = try await operation() + let duration = clock.now - start + return (result, duration) + } + + // MARK: - Language Detection + + /// Detects the dominant language in the given text. + static func detectLanguage(_ text: String) -> NLLanguage? { + let recognizer = NLLanguageRecognizer() + recognizer.processString(text) + return recognizer.dominantLanguage + } + + /// Verifies that the text matches the expected language. + static func verifyLanguage(_ text: String, matches expected: NLLanguage) -> Bool { + detectLanguage(text) == expected + } + + /// Verifies that all excerpts match the expected language. + static func verifyExcerptsLanguage(_ excerpts: [String], expectedLanguage: NLLanguage) { + for (index, excerpt) in excerpts.enumerated() { + let detectedLanguage = detectLanguage(excerpt) + + #expect( + detectedLanguage == expectedLanguage, + "Excerpt \(index + 1) language mismatch: expected \(expectedLanguage.rawValue), got \(detectedLanguage?.rawValue ?? 
"unknown")\nExcerpt: \(excerpt)" + ) + } + } + + // MARK: - Word Counting + + /// Counts words in text, properly handling all languages including CJK. + /// + /// Uses NLTokenizer for accurate word segmentation across different scripts: + /// - Space-separated languages (English, Spanish, French, etc.) + /// - CJK languages without spaces (Japanese, Mandarin) + /// - Mixed scripts + static func countWords(_ text: String, language: NLLanguage? = nil) -> Int { + guard !text.isEmpty else { return 0 } + + let tokenizer = NLTokenizer(unit: .word) + tokenizer.string = text + + if let language = language { + tokenizer.setLanguage(language) + } + + var wordCount = 0 + tokenizer.enumerateTokens(in: text.startIndex.. 150% of max (test fails) + /// - **Warning**: Word count slightly outside target range but within lenient bounds (test passes with warning) + /// - **Pass**: Word count within target range + /// + /// This approach accommodates LLM variance and language differences while catching egregious violations. + static func verifyExcerptsWordCount( + _ excerpts: [String], + wordRange: ClosedRange, + language: NLLanguage? 
= nil + ) { + // Lenient bounds: allow 30% below min, 50% above max before failing + let strictMinWords = Int(Double(wordRange.lowerBound) * 0.7) // 70% of minimum + let strictMaxWords = Int(Double(wordRange.upperBound) * 1.5) // 150% of maximum + + for (index, excerpt) in excerpts.enumerated() { + let wordCount = countWords(excerpt, language: language) + + // Check minimum word count + if wordCount < strictMinWords { + // FAIL: Way too short (< 70% of target minimum) + #expect( + wordCount >= strictMinWords, + "Excerpt \(index + 1) CRITICALLY SHORT: \(wordCount) words (target: \(wordRange.lowerBound)-\(wordRange.upperBound), minimum acceptable: \(strictMinWords))\nExcerpt: \(excerpt)" + ) + } else if wordCount < wordRange.lowerBound { + // WARNING: Below target but within acceptable bounds + Issue.record( + Comment(rawValue: "⚠️ Excerpt \(index + 1) slightly short: \(wordCount) words (target minimum: \(wordRange.lowerBound), acceptable minimum: \(strictMinWords))\nExcerpt: \(excerpt)") + ) + } + + // Check maximum word count + if wordCount > strictMaxWords { + // FAIL: Way too long (> 150% of target maximum) + #expect( + wordCount <= strictMaxWords, + "Excerpt \(index + 1) CRITICALLY LONG: \(wordCount) words (target: \(wordRange.lowerBound)-\(wordRange.upperBound), maximum acceptable: \(strictMaxWords))\nExcerpt: \(excerpt)" + ) + } else if wordCount > wordRange.upperBound { + // WARNING: Above target but within acceptable bounds + Issue.record( + Comment(rawValue: "⚠️ Excerpt \(index + 1) slightly long: \(wordCount) words (target maximum: \(wordRange.upperBound), acceptable maximum: \(strictMaxWords))\nExcerpt: \(excerpt)") + ) + } + } + } + + // MARK: - Excerpt Diversity + + /// Calculates Levenshtein distance between two strings. + /// + /// Levenshtein distance is the minimum number of single-character edits + /// (insertions, deletions, or substitutions) required to change one string into another. 
+ /// + /// - Returns: The edit distance between the two strings + static func levenshteinDistance(_ s1: String, _ s2: String) -> Int { + let s1Array = Array(s1) + let s2Array = Array(s2) + let m = s1Array.count + let n = s2Array.count + + guard m > 0 else { return n } + guard n > 0 else { return m } + + var matrix = Array(repeating: Array(repeating: 0, count: n + 1), count: m + 1) + + for i in 0...m { + matrix[i][0] = i + } + + for j in 0...n { + matrix[0][j] = j + } + + for i in 1...m { + for j in 1...n { + let cost = s1Array[i - 1] == s2Array[j - 1] ? 0 : 1 + matrix[i][j] = min( + matrix[i - 1][j] + 1, // deletion + matrix[i][j - 1] + 1, // insertion + matrix[i - 1][j - 1] + cost // substitution + ) + } + } + + return matrix[m][n] + } + + /// Calculates similarity ratio between two strings (0.0 to 1.0). + /// + /// - Returns: 1.0 for identical strings, 0.0 for completely different strings + static func similarityRatio(_ s1: String, _ s2: String) -> Double { + let maxLength = max(s1.count, s2.count) + guard maxLength > 0 else { return 1.0 } + + let distance = levenshteinDistance(s1, s2) + return 1.0 - Double(distance) / Double(maxLength) + } + + /// Verifies that all excerpts are sufficiently different from each other. + /// + /// Checks all pairs of excerpts to ensure they have meaningful variation. + /// Uses Levenshtein distance to measure similarity. + /// + /// - Parameters: + /// - excerpts: The excerpts to compare + /// - minDifferenceRatio: Minimum required difference (0.0-1.0). 
Default 0.15 means excerpts must be at least 15% different + static func verifyExcerptsDiversity( + _ excerpts: [String], + minDifferenceRatio: Double = 0.15 + ) { + guard excerpts.count >= 2 else { return } + + for i in 0..= minDifferenceRatio, + """ + Excerpts \(i + 1) and \(j + 1) are too similar (\(String(format: "%.1f%%", similarity * 100)) similar, \ + need at least \(String(format: "%.1f%%", minDifferenceRatio * 100)) difference) + + Excerpt \(i + 1): \(excerpts[i]) + + Excerpt \(j + 1): \(excerpts[j]) + """ + ) + } + } + } + + private static func languageFlag(for language: String) -> String { + switch language.lowercased() { + case "spanish": return "🇪🇸" + case "english": return "🇬🇧" + case "french": return "🇫🇷" + case "japanese": return "🇯🇵" + case "german": return "🇩🇪" + case "mandarin": return "🇨🇳" + case "hindi": return "🇮🇳" + case "russian": return "🇷🇺" + case "mixed": return "🌐" + case "dominant": return "🌐" + default: return "🌍" + } + } + + /// Calculate visual length of string, accounting for emoji width. + /// Different emojis have different visual widths in terminals. + static func visualLength(_ string: String) -> Int { + var length = 0 + var skipNext = false + + for scalar in string.unicodeScalars { + if skipNext { + skipNext = false + continue + } + + // Regional indicator symbols (flag emojis) - they come in pairs + if (0x1F1E6...0x1F1FF).contains(scalar.value) { + length += 2 + skipNext = true // Skip the second regional indicator + } else if scalar.properties.isEmoji || scalar.properties.isEmojiPresentation { + // Simple emojis like ✓ often render as 1-2 spaces + // Being conservative: most emojis take 2 spaces + length += 2 + } else { + length += 1 + } + } + return length + } + + // MARK: - Tag Validation + + /// Verifies that all tags match the expected language. 
+ static func verifyTagsLanguage(_ tags: [String], expectedLanguage: NLLanguage) { + for (index, tag) in tags.enumerated() { + let detectedLanguage = detectLanguage(tag) + + #expect( + detectedLanguage == expectedLanguage, + "Tag \(index + 1) language mismatch: expected \(expectedLanguage.rawValue), got \(detectedLanguage?.rawValue ?? "unknown")\nTag: \(tag)" + ) + } + } + + /// Verifies that tags follow the same format as site tags. + /// Checks for patterns like: lowercase-with-hyphens, lowercase_with_underscores, Title Case, etc. + static func verifyTagsFormat(_ tags: [String], siteTags: [String]) { + guard !siteTags.isEmpty else { return } + + // Detect format pattern from site tags + let hasHyphens = siteTags.contains { $0.contains("-") } + let hasUnderscores = siteTags.contains { $0.contains("_") } + let hasSpaces = siteTags.contains { $0.contains(" ") } + let hasUppercase = siteTags.contains { $0.rangeOfCharacter(from: .uppercaseLetters) != nil } + + for tag in tags { + // Skip format check for non-Latin scripts (Japanese, Chinese, etc.) 
+ let isLatinScript = tag.rangeOfCharacter(from: CharacterSet.letters) != nil + guard isLatinScript else { continue } + + // Record warnings for format inconsistencies (not failures) + // LLM may reasonably vary formatting based on context + if hasHyphens && !tag.contains("-") && tag.contains(" ") { + Issue.record( + Comment(rawValue: "⚠️ Site tags use hyphens, but tag '\(tag)' uses spaces") + ) + } else if hasUnderscores && !tag.contains("_") && tag.contains(" ") { + Issue.record( + Comment(rawValue: "⚠️ Site tags use underscores, but tag '\(tag)' uses spaces") + ) + } + + // Check case consistency + let tagHasUppercase = tag.rangeOfCharacter(from: .uppercaseLetters) != nil + if !hasUppercase && tagHasUppercase { + Issue.record( + Comment(rawValue: "⚠️ Site tags are lowercase, but tag '\(tag)' has uppercase") + ) + } + } + } + + /// Print formatted tag results with context + static func printTagResults( + parameters: TagTestCaseParameters, + tags: [String], + duration: Duration + ) { + let durationSeconds = Double(duration.components.seconds) + Double(duration.components.attoseconds) / 1e18 + + printSectionHeader(parameters.testDescription) + + print("⏱️ Generated \(tags.count) tags in \(String(format: "%.3f", durationSeconds))s") + + if !parameters.siteTags.isEmpty { + print("🏷️ Site tags context: \(parameters.siteTags.count) tags") + } + print() + + for (i, tag) in tags.enumerated() { + let detectedLanguage = detectLanguage(tag) + let languageInfo = detectedLanguage.map { " [\($0.rawValue)]" } ?? "" + print(" \(i + 1). \(tag)\(languageInfo)") + } + + printSectionFooter() + } + + // MARK: - Summary Validation + + /// Verifies that a summary is in the expected language. + /// Uses NLLanguageRecognizer to detect the language of the summary. 
+ static func verifySummaryLanguage(_ summary: String, expectedLanguage: NLLanguage) { + let detectedLanguage = detectLanguage(summary) + + #expect(detectedLanguage == expectedLanguage, + "Summary language mismatch: expected \(expectedLanguage.rawValue), got \(detectedLanguage?.rawValue ?? "unknown")") + } + + /// Prints formatted summary test results to console. + static func printSummaryResults( + parameters: SummaryTestCaseParameters, + summary: String, + duration: Duration + ) { + printSectionHeader("") + + // Test info + print("Test: \(parameters.testDescription)") + print("Language: \(parameters.data.languageCode.rawValue)") + + // Duration + let durationSeconds = Double(duration.components.seconds) + Double(duration.components.attoseconds) / 1e18 + print("Duration: \(String(format: "%.2f", durationSeconds))s") + + // Word count comparison + let summaryWordCount = summary.components(separatedBy: .whitespacesAndNewlines).filter { !$0.isEmpty }.count + let originalWordCount = parameters.data.content.components(separatedBy: .whitespacesAndNewlines).filter { !$0.isEmpty }.count + let compressionRatio = Double(summaryWordCount) / Double(originalWordCount) * 100.0 + print("Compression: \(originalWordCount) → \(summaryWordCount) words (\(String(format: "%.1f", compressionRatio))%)") + + print("") + + // Summary content + print("Summary:") + print(summary.wrapped(width: 80).split(separator: "\n").map { " \($0)" }.joined(separator: "\n")) + + printSectionFooter() + } +} + +// MARK: - String Extensions + +private extension String { + /// Wraps text to specified width while preserving words. + /// Accounts for emoji visual width in terminals. 
+ func wrapped(width: Int) -> String { + var result = "" + var currentLine = "" + var currentWidth = 0 + + for word in self.split(separator: " ") { + let wordWidth = TestHelpers.visualLength(String(word)) + + if currentWidth + wordWidth + 1 > width { + if !result.isEmpty { + result += "\n" + } + result += currentLine.trimmingCharacters(in: .whitespaces) + currentLine = String(word) + " " + currentWidth = wordWidth + 1 + } else { + currentLine += word + " " + currentWidth += wordWidth + 1 + } + } + + if !currentLine.isEmpty { + if !result.isEmpty { + result += "\n" + } + result += currentLine.trimmingCharacters(in: .whitespaces) + } + + return result + } +} diff --git a/Modules/Tests/WordPressIntelligenceTests/Makefile b/Modules/Tests/WordPressIntelligenceTests/Makefile new file mode 100644 index 000000000000..2ed44bedc498 --- /dev/null +++ b/Modules/Tests/WordPressIntelligenceTests/Makefile @@ -0,0 +1,70 @@ +.PHONY: help eval eval-quick eval-tags eval-summary open clean + +# ============================================================================= +# Evaluation Targets - Run full evaluations with Claude scoring +# ============================================================================= + +# Default test types (all of them) +TESTS ?= excerpts tags summary + +eval: + @for test_type in $(TESTS); do \ + echo "Running evaluation for $$test_type..."; \ + ./lib/evaluate.py --test-type $$test_type || exit 1; \ + done + +eval-quick: + @./lib/evaluate.py --only-testing "PostExcerptGeneratorTests/excerptGenerationEnglish(parameters:)" + +eval-tags: + @./lib/evaluate.py --test-type tags + +eval-summary: + @./lib/evaluate.py --test-type summary + +# ============================================================================= +# Utilities +# ============================================================================= + +open: + @LATEST=$$(ls -t "$(TMPDIR)WordPressIntelligence-Tests/*/evaluation-report.html" 2>/dev/null | head -1); \ + if [ -n "$$LATEST" ]; then \ + 
echo "Opening: $$LATEST"; \ + open "$$LATEST"; \ + else \ + echo "No report found. Run 'make eval' or 'make rebuild' first."; \ + fi + +clean: + @echo "Cleaning generated files..." + @rm -rf "$(TMPDIR)WordPressIntelligence-Tests" + @echo "✓ Cleaned" + +# ============================================================================= +# Help +# ============================================================================= + +help: + @echo "┌─────────────────────────────────────────────────────────┐" + @echo "│ WordPressIntelligence Evaluation Test Suite │" + @echo "└─────────────────────────────────────────────────────────┘" + @echo "" + @echo "━━━ Evaluation (Run tests + Claude scoring) ━━━" + @echo "" + @echo " make eval Run all test types (excerpts, tags, summary)" + @echo " make eval TESTS=\"excerpts\" Run only excerpt tests" + @echo " make eval TESTS=\"excerpts tags\" Run excerpt and tag tests" + @echo " make eval-quick Run English excerpt tests only" + @echo " make eval-tags Run tag suggestion tests" + @echo " make eval-summary Run post summary tests" + @echo "" + @echo "━━━ Utilities ━━━" + @echo "" + @echo " make open Open latest HTML report" + @echo " make clean Clean all generated files" + @echo " make help Show this help" + @echo "" + @echo "For evaluation CLI options: ./lib/evaluate.py --help" + @echo "" + +.DEFAULT_GOAL := help diff --git a/Modules/Tests/WordPressIntelligenceTests/PostExcerptGeneratorTests.swift b/Modules/Tests/WordPressIntelligenceTests/PostExcerptGeneratorTests.swift new file mode 100644 index 000000000000..3a93858386e9 --- /dev/null +++ b/Modules/Tests/WordPressIntelligenceTests/PostExcerptGeneratorTests.swift @@ -0,0 +1,310 @@ +import Testing +import Foundation +import FoundationModels +import NaturalLanguage +@testable import WordPressIntelligence + +@Suite(.serialized) +struct PostExcerptGeneratorTests { + // MARK: - Standard Test Cases + + @available(iOS 26, *) + @Test(arguments: ExcerptTestCaseParameters.englishCases) + func 
excerptGenerationEnglish(parameters: ExcerptTestCaseParameters) async throws { + _ = try await runExcerptTest(parameters: parameters) + } + + @available(iOS 26, *) + @Test(arguments: ExcerptTestCaseParameters.nonEnglishCases) + func excerptGenerationNonEnglish(parameters: ExcerptTestCaseParameters) async throws { + _ = try await runExcerptTest(parameters: parameters) + } + + @available(iOS 26, *) + @Test("HTML content") + func htmlContent() async throws { + let parameters = ExcerptTestCaseParameters( + data: TestData.englishPostWithHTML, + length: .medium, + style: .engaging + ) + _ = try await runExcerptTest(parameters: parameters) + } + + @available(iOS 26, *) + @Test("Very short content") + func veryShortContent() async throws { + let parameters = ExcerptTestCaseParameters( + data: TestData.veryShortEnglishContent, + length: .short, + style: .engaging + ) + _ = try await runExcerptTest(parameters: parameters) + } + + @available(iOS 26, *) + @Test("Unsupported language: Hindi") + func unsupportedLanguages() async throws { + let parameters = ExcerptTestCaseParameters( + data: TestData.hindiPost, + length: .short, + style: .conversational + ) + let generator = PostExcerptGenerator(length: parameters.length, style: parameters.style) + + do { + _ = try await generator.generate(for: parameters.data.content) + Issue.record("Expected unsupportedLanguageOrLocale error but no error was thrown") + } catch LanguageModelSession.GenerationError.unsupportedLanguageOrLocale { + return + } catch { + Issue.record("Expected unsupportedLanguageOrLocale error but got: \(error)") + } + } + + // MARK: - Error Handling Tests + + @available(iOS 26, *) + @Test("Empty content") + func emptyContent() async throws { + let parameters = ExcerptTestCaseParameters( + data: TestData.emptyContent, + length: .short, + style: .engaging + ) + let generator = PostExcerptGenerator(length: parameters.length, style: parameters.style) + + // Empty content should either throw an error or return empty 
excerpts + do { + let excerpts = try await generator.generate(for: parameters.data.content) + // If it doesn't throw, verify it returns empty or sensible default + #expect(excerpts.isEmpty || excerpts.allSatisfy { $0.isEmpty }) + } catch { + // Expected to throw for empty content - this is acceptable behavior + return + } + } + + @available(iOS 26, *) + @Test("Very long content (>10K words)") + func veryLongContent() async throws { + let parameters = ExcerptTestCaseParameters( + data: TestData.veryLongContent, + length: .medium, + style: .professional + ) + + // Should handle gracefully - either generate excerpts or throw appropriate error + // Allow longer processing time for very long content + do { + let (excerpts, _) = try await runExcerptTest( + parameters: parameters, + maxDuration: .seconds(30) + ) + + // If successful, verify excerpts are reasonable despite long input + #expect(!excerpts.isEmpty) + + // Word count should still be within bounds + for excerpt in excerpts { + let wordCount = TestHelpers.countWords(excerpt, language: .english) + #expect(parameters.length.wordRange.contains(wordCount), + "Word count \(wordCount) out of range for long content") + } + } catch { + // May throw due to content length limits - this is acceptable + return + } + } + + @available(iOS 26, *) + @Test("Performance benchmark") + func performanceBenchmark() async throws { + let parameters = ExcerptTestCaseParameters( + data: TestData.englishTechPost, + length: .medium, + style: .engaging + ) + + // Standard content should complete within 5 seconds + let (excerpts, duration) = try await runExcerptTest( + parameters: parameters, + maxDuration: .seconds(5) + ) + + // Verify generation was successful + #expect(!excerpts.isEmpty) + + // Log performance for tracking + let durationSeconds = Double(duration.components.seconds) + Double(duration.components.attoseconds) / 1e18 + print("Performance: Generated \(excerpts.count) excerpts in \(String(format: "%.2f", durationSeconds))s") + 
} + + @available(iOS 26, *) + @Test("Malformed HTML") + func malformedHTML() async throws { + let parameters = ExcerptTestCaseParameters( + data: TestData.malformedHTML, + length: .short, + style: .conversational + ) + + // Should handle malformed HTML gracefully (extract text or clean it up) + let (excerpts, _) = try await runExcerptTest(parameters: parameters) + + // Verify excerpts don't contain HTML tags + for excerpt in excerpts { + #expect(!excerpt.contains("<") && !excerpt.contains(">"), + "Excerpt should not contain HTML tags: \(excerpt)") + } + + // Verify excerpts are not empty (HTML was successfully processed) + #expect(!excerpts.isEmpty) + #expect(excerpts.allSatisfy { !$0.trimmingCharacters(in: .whitespaces).isEmpty }) + } + + @available(iOS 26, *) + @Test("Content with emojis and special Unicode characters") + func emojiAndSpecialCharacters() async throws { + let parameters = ExcerptTestCaseParameters( + data: TestData.emojiAndSpecialCharacters, + length: .medium, + style: .engaging + ) + + let (excerpts, _) = try await runExcerptTest(parameters: parameters) + + // Verify excerpts are generated successfully + #expect(!excerpts.isEmpty) + + // Verify excerpts handle Unicode correctly (no corruption or truncation) + for excerpt in excerpts { + // Should not be empty after Unicode processing + #expect(!excerpt.trimmingCharacters(in: .whitespaces).isEmpty) + + // Check that excerpts preserve some Unicode content or handle it gracefully + // (may or may not include emojis depending on generation logic) + let hasContent = excerpt.count > 10 + #expect(hasContent, "Excerpt should have meaningful content despite Unicode") + } + } + + @available(iOS 26, *) + @Test("Mixed language content") + func mixedLanguageContent() async throws { + let parameters = ExcerptTestCaseParameters( + data: TestData.mixedLanguagePost, + length: .medium, + style: .professional + ) + + // Skip language check since content is intentionally mixed + let (excerpts, _) = try await 
runExcerptTest( + parameters: parameters, + skip: .skipLanguageCheck + ) + + // Should generate excerpts for mixed language content + #expect(!excerpts.isEmpty) + + // Verify excerpts have reasonable word counts + for excerpt in excerpts { + let wordCount = TestHelpers.countWords(excerpt, language: .english) + #expect(parameters.length.wordRange.contains(wordCount), + "Mixed language excerpt word count \(wordCount) out of range") + } + } + + // MARK: - Helper Types + + /// Validation options for excerpt generation tests + struct ValidationOptions: OptionSet { + let rawValue: Int + + static let skipLanguageCheck = ValidationOptions(rawValue: 1 << 0) + static let skipWordCountCheck = ValidationOptions(rawValue: 1 << 1) + static let skipDiversityCheck = ValidationOptions(rawValue: 1 << 2) + + static let all: ValidationOptions = [] + static let skipAll: ValidationOptions = [.skipLanguageCheck, .skipWordCountCheck, .skipDiversityCheck] + } + + // MARK: - Helper Methods + + /// Reusable test helper that runs excerpt generation and performs standard validations + @available(iOS 26, *) + private func runExcerptTest( + parameters: ExcerptTestCaseParameters, + skip: ValidationOptions = [], + maxDuration: Duration? 
= .seconds(10) + ) async throws -> ([String], Duration) { + let generator = PostExcerptGenerator(length: parameters.length, style: parameters.style) + + let (excerpts, duration) = try await TestHelpers.measure { + try await generator.generate(for: parameters.data.content) + } + + // Performance benchmark + if let maxDuration = maxDuration { + let durationSeconds = Double(duration.components.seconds) + Double(duration.components.attoseconds) / 1e18 + let maxSeconds = Double(maxDuration.components.seconds) + Double(maxDuration.components.attoseconds) / 1e18 + #expect( + duration <= maxDuration, + "Generation took too long: \(String(format: "%.2f", durationSeconds))s (max: \(String(format: "%.2f", maxSeconds))s)" + ) + } + + if !skip.contains(.skipLanguageCheck) { + TestHelpers.verifyExcerptsLanguage(excerpts, expectedLanguage: parameters.data.languageCode) + } + + if !skip.contains(.skipWordCountCheck) { + TestHelpers.verifyExcerptsWordCount( + excerpts, + wordRange: parameters.length.wordRange, + language: parameters.data.languageCode + ) + } + + if !skip.contains(.skipDiversityCheck) { + TestHelpers.verifyExcerptsDiversity(excerpts) + } + + try? 
import Testing
import Foundation
import FoundationModels
import NaturalLanguage
@testable import WordPressIntelligence

// MARK: - Shared helpers

extension Duration {
    /// Total duration expressed as fractional seconds (whole seconds plus attoseconds).
    ///
    /// Replaces the `Double(components.seconds) + Double(components.attoseconds) / 1e18`
    /// arithmetic that was previously duplicated at every benchmark/logging call site.
    var fractionalSeconds: Double {
        Double(components.seconds) + Double(components.attoseconds) / 1e18
    }
}

// MARK: - Excerpt test parameters

struct ExcerptTestCaseParameters: CustomTestStringConvertible {
    let data: TestContent
    let length: ContentLength
    let style: WritingStyle

    var testDescription: String {
        "\(data.title) - \(length.displayName), \(style.displayName)"
    }

    typealias Data = TestData

    /// English-language cases cover one combination per content length.
    static let englishCases: [ExcerptTestCaseParameters] = [
        ExcerptTestCaseParameters(data: Data.englishTechPost, length: .short, style: .witty),
        ExcerptTestCaseParameters(data: Data.englishAcademicPost, length: .medium, style: .formal),
        ExcerptTestCaseParameters(data: Data.englishStoryPost, length: .long, style: .engaging),
    ]

    /// Non-English cases exercise language fidelity across scripts (Latin, CJK, Cyrillic).
    static let nonEnglishCases: [ExcerptTestCaseParameters] = [
        ExcerptTestCaseParameters(data: Data.spanishPost, length: .medium, style: .professional),
        ExcerptTestCaseParameters(data: Data.frenchPost, length: .short, style: .engaging),
        ExcerptTestCaseParameters(data: Data.japanesePost, length: .medium, style: .conversational),
        ExcerptTestCaseParameters(data: Data.germanTechPost, length: .short, style: .professional),
        ExcerptTestCaseParameters(data: Data.mandarinPost, length: .medium, style: .engaging),
        ExcerptTestCaseParameters(data: Data.russianPost, length: .medium, style: .formal),
    ]

    static let allCases: [ExcerptTestCaseParameters] = englishCases + nonEnglishCases
}

// MARK: - Post summary tests

/// Serialized because each test drives an on-device language-model session;
/// running them concurrently would contend for the model.
@Suite(.serialized)
struct PostSummaryGeneratorTests {
    // MARK: - Standard Test Cases

    @available(iOS 26, *)
    @Test(arguments: SummaryTestCaseParameters.allCases)
    func postSummary(parameters: SummaryTestCaseParameters) async throws {
        _ = try await runSummaryTest(parameters: parameters)
    }

    // MARK: - Edge Case Tests

    @available(iOS 26, *)
    @Test("HTML content")
    func htmlContent() async throws {
        let parameters = SummaryTestCaseParameters(data: TestData.englishPostWithHTML)
        _ = try await runSummaryTest(parameters: parameters)
    }

    @available(iOS 26, *)
    @Test("Malformed HTML")
    func malformedHTML() async throws {
        let parameters = SummaryTestCaseParameters(data: TestData.malformedHTML)
        _ = try await runSummaryTest(parameters: parameters)
    }

    @available(iOS 26, *)
    @Test("Very short content")
    func veryShortContent() async throws {
        let parameters = SummaryTestCaseParameters(data: TestData.veryShortEnglishContent)
        _ = try await runSummaryTest(parameters: parameters)
    }

    @available(iOS 26, *)
    @Test("Very long content (>10K words)")
    func veryLongContent() async throws {
        let parameters = SummaryTestCaseParameters(data: TestData.veryLongContent)

        do {
            let (summary, _) = try await runSummaryTest(
                parameters: parameters,
                maxDuration: .seconds(30)
            )
            #expect(!summary.isEmpty, "Should generate summary even for very long content")
        } catch {
            // May throw due to content length limits - this is acceptable
            return
        }
    }

    @available(iOS 26, *)
    @Test("Emoji and special Unicode characters")
    func emojiAndSpecialCharacters() async throws {
        let parameters = SummaryTestCaseParameters(data: TestData.emojiAndSpecialCharacters)
        _ = try await runSummaryTest(parameters: parameters)
    }

    @available(iOS 26, *)
    @Test("Mixed language content")
    func mixedLanguageContent() async throws {
        let parameters = SummaryTestCaseParameters(data: TestData.mixedLanguagePost)

        // Skip language check since content is intentionally mixed
        _ = try await runSummaryTest(
            parameters: parameters,
            skip: .skipLanguageCheck
        )
    }

    @available(iOS 26, *)
    @Test("Performance benchmark")
    func performanceBenchmark() async throws {
        let parameters = SummaryTestCaseParameters(data: TestData.englishTechPost)

        let (summary, duration) = try await runSummaryTest(
            parameters: parameters,
            maxDuration: .seconds(5)
        )

        #expect(!summary.isEmpty, "Should generate summary")
        print("Performance: Generated summary in \(String(format: "%.2f", duration.fractionalSeconds))s")
    }

    // MARK: - Helper Types

    /// Validation options for summary tests
    struct ValidationOptions: OptionSet {
        let rawValue: Int

        static let skipLanguageCheck = ValidationOptions(rawValue: 1 << 0)
        static let skipLengthCheck = ValidationOptions(rawValue: 1 << 1)
        static let skipContentCheck = ValidationOptions(rawValue: 1 << 2)

        static let all: ValidationOptions = []
        static let skipAll: ValidationOptions = [.skipLanguageCheck, .skipLengthCheck, .skipContentCheck]
    }

    // MARK: - Helper Methods

    /// Reusable test helper that runs summary generation and performs standard validations.
    ///
    /// - Parameters:
    ///   - parameters: Content and language under test.
    ///   - skip: Validations to bypass (e.g. for intentionally mixed-language input).
    ///   - maxDuration: Soft performance budget; `nil` disables the timing check.
    /// - Returns: The generated summary and the measured generation duration.
    @available(iOS 26, *)
    private func runSummaryTest(
        parameters: SummaryTestCaseParameters,
        skip: ValidationOptions = [],
        maxDuration: Duration? = .seconds(10)
    ) async throws -> (String, Duration) {
        let generator = PostSummaryGenerator()

        let (summary, duration) = try await TestHelpers.measure {
            try await generator.generate(content: parameters.data.content)
        }

        // Performance validation
        if let maxDuration {
            #expect(
                duration <= maxDuration,
                "Generation took too long: \(String(format: "%.2f", duration.fractionalSeconds))s (max: \(String(format: "%.2f", maxDuration.fractionalSeconds))s)"
            )
        }

        // Validation: Non-empty
        #expect(!summary.isEmpty, "Summary should not be empty")

        // Validation: Language match
        if !skip.contains(.skipLanguageCheck) {
            TestHelpers.verifySummaryLanguage(summary, expectedLanguage: parameters.data.languageCode)
        }

        // Validation: Reasonable length (should be shorter than original)
        if !skip.contains(.skipLengthCheck) {
            let summaryWordCount = summary.components(separatedBy: .whitespacesAndNewlines).filter { !$0.isEmpty }.count
            let originalWordCount = parameters.data.content.components(separatedBy: .whitespacesAndNewlines).filter { !$0.isEmpty }.count
            #expect(summaryWordCount < originalWordCount,
                    "Summary (\(summaryWordCount) words) should be shorter than original (\(originalWordCount) words)")
        }

        // Validation: Content relevance. Only fails when the summary contains ALL of
        // the generic phrases at once — a deliberately weak heuristic for boilerplate output.
        if !skip.contains(.skipContentCheck) {
            let genericPhrases = ["this post", "this article", "the author"]
            let hasSpecificContent = !genericPhrases.allSatisfy { summary.lowercased().contains($0.lowercased()) }
            #expect(hasSpecificContent, "Summary should contain specific content, not just generic phrases")
        }

        // Record structured output for evaluation; recording failures never fail the test.
        try? SummaryTestOutput(
            parameters: parameters,
            summary: summary,
            duration: duration
        ).recordAndPrint(parameters: parameters, duration: duration)

        return (summary, duration)
    }
}

struct SummaryTestCaseParameters: CustomTestStringConvertible {
    let data: TestContent

    var testDescription: String {
        data.title
    }

    typealias Data = TestData

    /// One case per supported language to verify summaries stay in the source language.
    static let allCases: [SummaryTestCaseParameters] = [
        // English
        SummaryTestCaseParameters(data: Data.englishTechPost),
        SummaryTestCaseParameters(data: Data.englishPost),

        // Spanish
        SummaryTestCaseParameters(data: Data.spanishPost),

        // French
        SummaryTestCaseParameters(data: Data.frenchPost),

        // Japanese
        SummaryTestCaseParameters(data: Data.japanesePost),

        // German
        SummaryTestCaseParameters(data: Data.germanTechPost),

        // Mandarin
        SummaryTestCaseParameters(data: Data.mandarinPost),

        // Russian
        SummaryTestCaseParameters(data: Data.russianPost),
    ]
}

// MARK: - Tag suggestion tests

/// Serialized for the same reason as the other suites: one language-model session at a time.
@Suite(.serialized)
struct TagSuggestionGeneratorTests {
    // MARK: - Standard Test Cases

    @available(iOS 26, *)
    @Test(arguments: TagTestCaseParameters.allCases)
    func tagSuggestion(parameters: TagTestCaseParameters) async throws {
        _ = try await runTagTest(parameters: parameters)
    }

    // MARK: - Edge Case Tests

    @available(iOS 26, *)
    @Test("Exclude existing post tags")
    func excludeExistingTags() async throws {
        let parameters = TagTestCaseParameters(
            data: TestData.spanishPost,
            siteTags: TestData.spanishSiteTags,
            postTags: ["recetas", "cocina"]
        )
        let (tags, _) = try await runTagTest(parameters: parameters)

        #expect(!tags.contains { parameters.postTags.contains($0) },
                "Tags should not include existing post tags: \(parameters.postTags)")
    }

    @available(iOS 26, *)
    @Test("Empty site tags")
    func emptySiteTags() async throws {
        let parameters = TagTestCaseParameters(
            data: TestData.spanishPost,
            siteTags: [],
            postTags: []
        )
        _ = try await runTagTest(parameters: parameters)
    }

    @available(iOS 26, *)
    @Test("Very short content")
    func veryShortContent() async throws {
        let parameters = TagTestCaseParameters(
            data: TestData.veryShortEnglishContent,
            siteTags: TestData.englishSiteTags,
            postTags: []
        )
        _ = try await runTagTest(parameters: parameters)
    }

    @available(iOS 26, *)
    @Test("Very long content (>10K words)")
    func veryLongContent() async throws {
        let parameters = TagTestCaseParameters(
            data: TestData.veryLongContent,
            siteTags: TestData.englishSiteTags,
            postTags: []
        )

        do {
            let (tags, _) = try await runTagTest(
                parameters: parameters,
                maxDuration: .seconds(30)
            )
            #expect(!tags.isEmpty, "Should generate tags even for very long content")
        } catch {
            // May throw due to content length limits - this is acceptable
            return
        }
    }

    @available(iOS 26, *)
    @Test("HTML content")
    func htmlContent() async throws {
        let parameters = TagTestCaseParameters(
            data: TestData.englishPostWithHTML,
            siteTags: TestData.englishSiteTags,
            postTags: []
        )
        _ = try await runTagTest(parameters: parameters)
    }

    @available(iOS 26, *)
    @Test("Malformed HTML")
    func malformedHTML() async throws {
        let parameters = TagTestCaseParameters(
            data: TestData.malformedHTML,
            siteTags: TestData.englishSiteTags,
            postTags: []
        )
        _ = try await runTagTest(parameters: parameters)
    }

    @available(iOS 26, *)
    @Test("Emoji and special Unicode characters")
    func emojiAndSpecialCharacters() async throws {
        let parameters = TagTestCaseParameters(
            data: TestData.emojiAndSpecialCharacters,
            siteTags: TestData.englishSiteTags,
            postTags: []
        )
        _ = try await runTagTest(parameters: parameters)
    }

    @available(iOS 26, *)
    @Test("Mixed language content")
    func mixedLanguageContent() async throws {
        let parameters = TagTestCaseParameters(
            data: TestData.mixedLanguagePost,
            siteTags: TestData.englishSiteTags,
            postTags: []
        )

        // Skip language check since content is intentionally mixed
        _ = try await runTagTest(
            parameters: parameters,
            skip: .skipLanguageCheck
        )
    }

    @available(iOS 26, *)
    @Test("Performance benchmark")
    func performanceBenchmark() async throws {
        let parameters = TagTestCaseParameters(
            data: TestData.englishTechPost,
            siteTags: TestData.englishSiteTags,
            postTags: []
        )

        let (tags, duration) = try await runTagTest(
            parameters: parameters,
            maxDuration: .seconds(5)
        )

        #expect(!tags.isEmpty, "Should generate tags")
        print("Performance: Generated \(tags.count) tags in \(String(format: "%.2f", duration.fractionalSeconds))s")
    }

    // MARK: - Helper Types

    /// Validation options for tag suggestion tests
    struct ValidationOptions: OptionSet {
        let rawValue: Int

        static let skipLanguageCheck = ValidationOptions(rawValue: 1 << 0)
        static let skipFormatCheck = ValidationOptions(rawValue: 1 << 1)
        static let skipCountCheck = ValidationOptions(rawValue: 1 << 2)

        static let all: ValidationOptions = []
        static let skipAll: ValidationOptions = [.skipLanguageCheck, .skipFormatCheck, .skipCountCheck]
    }

    // MARK: - Helper Methods

    /// Reusable test helper that runs tag generation and performs standard validations.
    ///
    /// - Parameters:
    ///   - parameters: Content, existing site tags, and existing post tags.
    ///   - skip: Validations to bypass.
    ///   - maxDuration: Soft performance budget; `nil` disables the timing check.
    /// - Returns: The suggested tags and the measured generation duration.
    @available(iOS 26, *)
    private func runTagTest(
        parameters: TagTestCaseParameters,
        skip: ValidationOptions = [],
        maxDuration: Duration? = .seconds(10)
    ) async throws -> ([String], Duration) {
        let generator = TagSuggestionGenerator()

        let (tags, duration) = try await TestHelpers.measure {
            try await generator.generate(
                post: parameters.data.content,
                siteTags: parameters.siteTags,
                postTags: parameters.postTags
            )
        }

        // Performance validation
        if let maxDuration {
            #expect(
                duration <= maxDuration,
                "Generation took too long: \(String(format: "%.2f", duration.fractionalSeconds))s (max: \(String(format: "%.2f", maxDuration.fractionalSeconds))s)"
            )
        }

        // Validation: Language match
        if !skip.contains(.skipLanguageCheck) {
            TestHelpers.verifyTagsLanguage(tags, expectedLanguage: parameters.data.languageCode)
        }

        // Validation: Format consistency (only meaningful when site tags exist to compare against)
        if !skip.contains(.skipFormatCheck) && !parameters.siteTags.isEmpty {
            TestHelpers.verifyTagsFormat(tags, siteTags: parameters.siteTags)
        }

        // Validation: Count (5-10 tags as per @Guide)
        if !skip.contains(.skipCountCheck) {
            #expect(tags.count >= 5 && tags.count <= 10,
                    "Expected 5-10 tags, got \(tags.count)")
        }

        // Validation: Uniqueness
        let uniqueTags = Set(tags)
        #expect(uniqueTags.count == tags.count,
                "Tags contain duplicates: \(tags)")

        // Validation: No existing post tags
        let existingPostTags = Set(parameters.postTags)
        #expect(!tags.contains { existingPostTags.contains($0) },
                "Tags should not include existing post tags")

        // Record structured output for evaluation; recording failures never fail the test.
        try? TagTestOutput(
            parameters: parameters,
            tags: tags,
            duration: duration
        ).recordAndPrint(parameters: parameters, duration: duration)

        return (tags, duration)
    }
}

struct TagTestCaseParameters: CustomTestStringConvertible {
    let data: TestContent
    let siteTags: [String]
    let postTags: [String]

    var testDescription: String {
        "\(data.title) - \(siteTags.count) site tags"
    }

    typealias Data = TestData

    /// One case per supported language, each paired with that language's site tag fixture.
    static let allCases: [TagTestCaseParameters] = [
        // English
        TagTestCaseParameters(data: Data.englishTechPost, siteTags: Data.englishSiteTags, postTags: []),
        TagTestCaseParameters(data: Data.englishPost, siteTags: Data.englishSiteTags, postTags: []),

        // Spanish
        TagTestCaseParameters(data: Data.spanishPost, siteTags: Data.spanishSiteTags, postTags: []),

        // French
        TagTestCaseParameters(data: Data.frenchPost, siteTags: Data.frenchSiteTags, postTags: []),

        // Japanese
        TagTestCaseParameters(data: Data.japanesePost, siteTags: Data.japaneseSiteTags, postTags: []),

        // German
        TagTestCaseParameters(data: Data.germanTechPost, siteTags: Data.germanSiteTags, postTags: []),

        // Mandarin
        TagTestCaseParameters(data: Data.mandarinPost, siteTags: Data.mandarinSiteTags, postTags: []),

        // Russian
        TagTestCaseParameters(data: Data.russianPost, siteTags: Data.russianSiteTags, postTags: []),
    ]
}
#!/usr/bin/env python3
"""
Configuration for evaluation pipeline
Test types, scoring weights, and thresholds
"""

from dataclasses import dataclass
from typing import Dict, List
from enum import Enum


class TestType(str, Enum):
    """Identifies which generator a test run exercises (matches JSON `testType`)."""
    EXCERPT = "excerpt-generation"
    TAG = "tag-suggestion"
    SUMMARY = "post-summary"


@dataclass
class EvaluationCriteria:
    """Scoring criteria with weights.

    Attributes:
        name: JSON score key produced by the evaluator.
        weight: Relative weight when computing the weighted average.
        critical_threshold: If > 0, a score below this value is an auto-fail.
        description: Human-readable prompt for the evaluator.
    """
    name: str
    weight: float
    critical_threshold: float = 0.0  # If > 0, score below this = auto-fail
    description: str = ""


@dataclass
class TestTypeConfig:
    """Configuration for a test type."""
    test_class: str
    marker_prefix: str
    icon: str
    name: str
    criteria: List[EvaluationCriteria]
    pass_threshold: float
    needs_improvement_threshold: float
    total_weight: float
    diversity_check: bool = False


# Excerpt Generation Configuration
EXCERPT_CRITERIA = [
    EvaluationCriteria("languageMatch", 3.0, 8.0, "Is excerpt in the correct language?"),
    EvaluationCriteria("grammar", 2.0, 6.0, "Grammatical correctness and fluency"),
    EvaluationCriteria("relevance", 2.0, 0.0, "Captures main message of original?"),
    EvaluationCriteria("hookQuality", 1.5, 0.0, "Entices reader to continue?"),
    EvaluationCriteria("keyInformationPreservation", 1.5, 0.0, "Preserves critical facts?"),
    EvaluationCriteria("standalone", 1.0, 0.0, "Makes sense without context?"),
    EvaluationCriteria("engagement", 1.0, 0.0, "Maintains reader interest?"),
]

# Tag Suggestion Configuration
TAG_CRITERIA = [
    EvaluationCriteria("relevance", 2.0, 6.0, "Tag accurately represents content?"),
    EvaluationCriteria("languageMatch", 1.5, 7.0, "Tag in correct language?"),
    EvaluationCriteria("formatConsistency", 1.5, 0.0, "Matches site tag formatting?"),
    EvaluationCriteria("seoQuality", 1.0, 0.0, "Good for search/discovery?"),
    EvaluationCriteria("uniqueness", 1.0, 0.0, "Not duplicating existing tags?"),
    EvaluationCriteria("specificity", 1.0, 0.0, "Specific vs generic?"),
]

# Summary Generation Configuration
SUMMARY_CRITERIA = [
    EvaluationCriteria("conciseness", 2.0, 6.0, "Is the summary concise and to the point?"),
    EvaluationCriteria("accuracy", 2.0, 6.0, "Does it accurately represent the main points?"),
    EvaluationCriteria("languageMatch", 1.5, 7.0, "Is summary in correct language?"),
    EvaluationCriteria("clarity", 1.5, 0.0, "Is the summary clear and easy to understand?"),
    EvaluationCriteria("completeness", 1.0, 0.0, "Captures all major topics/themes?"),
    EvaluationCriteria("neutralTone", 1.0, 0.0, "Maintains objective, neutral tone?"),
    EvaluationCriteria("coherence", 1.0, 0.0, "Flows logically and makes sense?"),
]


def _total_weight(criteria: List[EvaluationCriteria]) -> float:
    """Sum of criterion weights.

    Derived from the criteria list rather than hard-coded so `total_weight`
    can never drift out of sync when a criterion is added/removed/reweighted.
    (The previously hard-coded TAG value of 7.5 disagreed with the actual
    criteria sum of 8.0.)
    """
    return sum(c.weight for c in criteria)


# Test Type Configurations
TEST_CONFIGS: Dict[TestType, TestTypeConfig] = {
    TestType.EXCERPT: TestTypeConfig(
        test_class="PostExcerptGeneratorTests",
        marker_prefix="EXCERPT_OUTPUT",
        icon="🖥️",
        name="Excerpt Generation",
        criteria=EXCERPT_CRITERIA,
        pass_threshold=7.0,
        needs_improvement_threshold=6.0,
        total_weight=_total_weight(EXCERPT_CRITERIA),  # 12.0 (lengthAppropriate/styleMatch removed)
        diversity_check=True,
    ),
    TestType.TAG: TestTypeConfig(
        test_class="TagSuggestionGeneratorTests",
        marker_prefix="TAG_OUTPUT",
        icon="🏷️",
        name="Tag Suggestion",
        criteria=TAG_CRITERIA,
        pass_threshold=7.5,
        needs_improvement_threshold=6.5,
        total_weight=_total_weight(TAG_CRITERIA),  # 8.0
        diversity_check=False,
    ),
    TestType.SUMMARY: TestTypeConfig(
        test_class="PostSummaryGeneratorTests",
        marker_prefix="SUMMARY_OUTPUT",
        icon="📝",
        name="Post Summary",
        criteria=SUMMARY_CRITERIA,
        pass_threshold=7.5,
        needs_improvement_threshold=6.5,
        total_weight=_total_weight(SUMMARY_CRITERIA),  # 10.0
        diversity_check=False,
    ),
}


def get_config(test_type: TestType) -> TestTypeConfig:
    """Get configuration for test type."""
    return TEST_CONFIGS[test_type]


def detect_test_type(test_type_str: str) -> TestType:
    """Detect test type from JSON testType field.

    Unknown strings fall back to EXCERPT, preserving the original
    best-effort behavior for legacy report files.
    """
    mapping = {
        "excerpt-generation": TestType.EXCERPT,
        "tag-suggestion": TestType.TAG,
        "post-summary": TestType.SUMMARY,
    }
    return mapping.get(test_type_str, TestType.EXCERPT)
#!/usr/bin/env python3
"""
Console report generator
Displays formatted evaluation summary in the terminal
"""

import json
import sys
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple


class Colors:
    """ANSI escape codes for terminal coloring."""
    RED = '\033[0;31m'
    GREEN = '\033[0;32m'
    YELLOW = '\033[1;33m'
    BLUE = '\033[0;34m'
    CYAN = '\033[0;36m'
    NC = '\033[0m'  # reset


def get_score_indicator(score: float) -> str:
    """Get score indicator with emoji"""
    if score >= 9.0:
        return "⭐️ Excellent"
    elif score >= 8.0:
        return "⭐️ Great"
    elif score >= 7.0:
        return "✅ Good"
    elif score >= 6.0:
        return "⚠️ Needs Work"
    else:
        return "❌ Poor"


# Category tables per test type: (JSON score key, display label).
_TAG_CATEGORIES: List[Tuple[str, str]] = [
    ('relevance', 'Relevance'),
    ('languageMatch', 'Language Match'),
    ('formatConsistency', 'Format Consistency'),
    ('seoQuality', 'SEO Quality'),
    ('uniqueness', 'Uniqueness'),
    ('specificity', 'Specificity'),
]

_SUMMARY_CATEGORIES: List[Tuple[str, str]] = [
    ('conciseness', 'Conciseness'),
    ('accuracy', 'Accuracy'),
    ('languageMatch', 'Language Match'),
    ('clarity', 'Clarity'),
    ('completeness', 'Completeness'),
    ('neutralTone', 'Neutral Tone'),
    ('coherence', 'Coherence'),
]

_EXCERPT_CATEGORIES: List[Tuple[str, str]] = [
    ('languageMatch', 'Language Match'),
    ('grammar', 'Grammar'),
    ('relevance', 'Relevance'),
    ('lengthAppropriate', 'Length'),
    ('standalone', 'Standalone'),
    ('styleMatch', 'Style Match'),
    ('engagement', 'Engagement'),
]


def _display_category_table(subject: str,
                            categories: List[Tuple[str, str]],
                            avg_by_category: Dict[str, float]):
    """Shared renderer for the per-category averages table.

    Missing categories render as 0 (matching the previous behavior of the
    three copy-pasted display functions this replaces).
    """
    print(f"{Colors.BLUE}📈 Category Averages (across all {subject}){Colors.NC}")
    for key, label in categories:
        score = avg_by_category.get(key, 0)
        indicator = get_score_indicator(score)
        print(f"  {label:20} {score:.1f}/10 {indicator}")


def display_tag_categories(avg_by_category: Dict[str, float]):
    """Display tag-specific categories"""
    _display_category_table("tags", _TAG_CATEGORIES, avg_by_category)


def display_summary_categories(avg_by_category: Dict[str, float]):
    """Display summary-specific categories"""
    _display_category_table("summaries", _SUMMARY_CATEGORIES, avg_by_category)


def display_excerpt_categories(avg_by_category: Dict[str, float]):
    """Display excerpt-specific categories"""
    _display_category_table("excerpts", _EXCERPT_CATEGORIES, avg_by_category)

    # Diversity is only shown when present and non-zero.
    diversity = avg_by_category.get('diversity', 0)
    if diversity > 0:
        indicator = get_score_indicator(diversity)
        print(f"  {'Diversity':20} {diversity:.1f}/10 {indicator}")


def _display_test_list(results: List[Dict],
                       status: str,
                       header: str,
                       diversity_floor: float,
                       highlight_diversity: bool):
    """Shared renderer for the failed / needs-improvement test listings.

    Args:
        results: All result records from the JSON report.
        status: Status value to filter on ('failed' or 'needsImprovement').
        header: Pre-colored header line.
        diversity_floor: Diversity scores in (0, floor) are called out.
        highlight_diversity: Whether to color the diversity call-out yellow.
    """
    print(header)
    for result in results:
        if result.get('status') != status:
            continue

        test_name = result.get('testName', 'Unknown')
        avg_score = result.get('averageScore', 0)
        lowest_score = result.get('lowestScore', 0)
        diversity = result.get('diversity', {}).get('score', 0)

        print(f"  • {test_name}")
        print(f"    Avg: {avg_score:.1f}/10 | Min: {lowest_score:.1f}/10")

        if 0 < diversity < diversity_floor:
            if highlight_diversity:
                print(f"    {Colors.YELLOW}Low Diversity: {diversity:.1f}/10{Colors.NC}")
            else:
                print(f"    Low Diversity: {diversity:.1f}/10")
    print()


def display_failed_tests(results: List[Dict], failed_count: int):
    """Display failed tests"""
    _display_test_list(
        results,
        status='failed',
        header=f"{Colors.RED}❌ Failed Tests ({failed_count}){Colors.NC}",
        diversity_floor=4.0,
        highlight_diversity=True,
    )


def display_needs_improvement_tests(results: List[Dict], needs_count: int):
    """Display tests that need improvement"""
    _display_test_list(
        results,
        status='needsImprovement',
        header=f"{Colors.YELLOW}⚠️ Needs Improvement ({needs_count}){Colors.NC}",
        diversity_floor=5.0,
        highlight_diversity=False,
    )


def display_tag_suggestions(results: List[Dict]):
    """Display tag-specific improvement suggestions"""
    suggestion_count = 0

    # Count low scores across all tags
    low_relevance = sum(
        1 for result in results
        for tag in result.get('tags', [])
        if tag.get('scores', {}).get('relevance', 10) < 6
    )
    low_lang = sum(
        1 for result in results
        for tag in result.get('tags', [])
        if tag.get('scores', {}).get('languageMatch', 10) < 7
    )
    low_format = sum(
        1 for result in results
        for tag in result.get('tags', [])
        if tag.get('scores', {}).get('formatConsistency', 10) < 7
    )

    if low_relevance > 0:
        suggestion_count += 1
        print(f"  {suggestion_count}. Relevance: Generate more relevant tags (affects {low_relevance} tags)")
        print()
    if low_lang > 0:
        suggestion_count += 1
        print(f"  {suggestion_count}. Language Enforcement (affects {low_lang} tags)")
        print()
    if low_format > 0:
        suggestion_count += 1
        print(f"  {suggestion_count}. Format Consistency: Match site tag formatting better (affects {low_format} tags)")
        print()

    if suggestion_count == 0:
        print("  No significant issues found!")
        print()


def display_summary_suggestions(results: List[Dict]):
    """Display summary-specific improvement suggestions"""
    suggestion_count = 0

    low_conciseness = sum(
        1 for result in results
        if result.get('summary', {}).get('scores', {}).get('conciseness', 10) < 6
    )
    low_accuracy = sum(
        1 for result in results
        if result.get('summary', {}).get('scores', {}).get('accuracy', 10) < 6
    )
    low_lang = sum(
        1 for result in results
        if result.get('summary', {}).get('scores', {}).get('languageMatch', 10) < 7
    )

    if low_conciseness > 0:
        suggestion_count += 1
        print(f"  {suggestion_count}. Conciseness: Generate more concise summaries (affects {low_conciseness} summaries)")
        print()
    if low_accuracy > 0:
        suggestion_count += 1
        print(f"  {suggestion_count}. Accuracy: Capture main points more accurately (affects {low_accuracy} summaries)")
        print()
    if low_lang > 0:
        suggestion_count += 1
        print(f"  {suggestion_count}. Language Enforcement (affects {low_lang} summaries)")
        print()

    if suggestion_count == 0:
        print("  No significant issues found!")
        print()


def display_excerpt_suggestions(results: List[Dict]):
    """Display excerpt-specific improvement suggestions"""
    suggestion_count = 0

    low_lang = sum(
        1 for result in results
        for excerpt in result.get('excerpts', [])
        if excerpt.get('scores', {}).get('languageMatch', 10) < 8
    )
    low_diversity = sum(
        1 for result in results
        if 0 < result.get('diversity', {}).get('score', 10) < 5
    )

    if low_lang > 0:
        suggestion_count += 1
        print(f"  {suggestion_count}. Language Enforcement (affects {low_lang} excerpts)")
        print()
    if low_diversity > 0:
        suggestion_count += 1
        print(f"  {suggestion_count}. Diversity: Generate more varied excerpts (affects {low_diversity} test cases)")
        print()

    if suggestion_count == 0:
        print("  No significant issues found!")
        print()


def display_console_summary(json_file: Path, xcresult_path: Optional[str] = None) -> int:
    """Display evaluation summary to console.

    Returns:
        0 on success, 1 if the JSON report is missing (previously returned
        None on success, making the exit code meaningless to callers).
    """
    if not json_file.exists():
        print(f"{Colors.RED}Error: JSON report not found: {json_file}{Colors.NC}")
        return 1

    # Load JSON report
    with open(json_file) as f:
        report = json.load(f)

    metadata = report.get('metadata', {})
    summary = report.get('summary', {})
    results = report.get('results', [])

    test_type = metadata.get('testType', 'excerpt-generation')

    # Extract summary data
    total = summary.get('total', 0)
    passed = summary.get('passed', 0)
    failed = summary.get('failed', 0)
    needs_improvement = summary.get('needsImprovement', 0)
    avg_score = summary.get('averageScore', 0)
    pass_rate = summary.get('passRate', 0) * 100

    # Display header
    rule = f"{Colors.BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━{Colors.NC}"
    print(rule)
    if test_type == 'tag-suggestion':
        print(f"{Colors.BLUE}🏷️ Tag Suggestion Evaluation - Summary{Colors.NC}")
    elif test_type == 'post-summary':
        print(f"{Colors.BLUE}📝 Post Summary Evaluation - Summary{Colors.NC}")
    else:
        print(f"{Colors.BLUE}🖥️ Excerpt Generation Evaluation - Summary{Colors.NC}")
    print(rule)
    print()

    # Overall statistics
    print(f"{Colors.BLUE}📊 Overall Statistics{Colors.NC}")
    print(f"  Total Test Cases: {total}")
    print(f"  {Colors.GREEN}✅ Passed: {passed} ({pass_rate:.1f}%){Colors.NC}")

    if failed > 0:
        fail_pct = (failed * 100 / total) if total > 0 else 0
        print(f"  {Colors.RED}❌ Failed: {failed} ({fail_pct:.1f}%){Colors.NC}")

    if needs_improvement > 0:
        needs_pct = (needs_improvement * 100 / total) if total > 0 else 0
        print(f"  {Colors.YELLOW}⚠️ Needs Work: {needs_improvement} ({needs_pct:.1f}%){Colors.NC}")

    print(f"  Average Score: {avg_score:.1f}/10")
    print()

    # Category averages (test type specific)
    avg_by_category = summary.get('averageByCategory', {})
    if test_type == 'tag-suggestion':
        display_tag_categories(avg_by_category)
    elif test_type == 'post-summary':
        display_summary_categories(avg_by_category)
    else:
        display_excerpt_categories(avg_by_category)
    print()

    # Failed / needs-improvement listings
    if failed > 0:
        display_failed_tests(results, failed)
    if needs_improvement > 0:
        display_needs_improvement_tests(results, needs_improvement)

    # Prompt improvement suggestions
    print(f"{Colors.BLUE}💡 Prompt Improvement Suggestions{Colors.NC}")
    if test_type == 'tag-suggestion':
        display_tag_suggestions(results)
    elif test_type == 'post-summary':
        display_summary_suggestions(results)
    else:
        display_excerpt_suggestions(results)

    # File paths
    print(f"{Colors.BLUE}📁 Detailed Results{Colors.NC}")
    html_file = json_file.parent / "evaluation-report.html"
    if html_file.exists():
        print(f"  HTML: {html_file}")
    print(f"  JSON: {json_file}")
    if xcresult_path and Path(xcresult_path).exists():
        print(f"  Tests: {xcresult_path}")

    print()
    print(rule)
    return 0


def main():
    if len(sys.argv) < 2:
        print("Usage: console_reporter.py <json_file> [xcresult_path]")
        sys.exit(1)

    json_file = Path(sys.argv[1])
    xcresult_path = sys.argv[2] if len(sys.argv) > 2 else None

    # Propagate the reporter's status so scripts can detect a missing report.
    sys.exit(display_console_summary(json_file, xcresult_path))


if __name__ == "__main__":
    main()
test_class + self.icon = icon + self.name = name + + +# Test type configurations +TEST_CONFIGS = { + 'excerpts': TestConfig('PostExcerptGeneratorTests', '🖥️', 'Excerpt Generation'), + 'tags': TestConfig('TagSuggestionGeneratorTests', '🏷️', 'Tag Suggestion'), + 'summary': TestConfig('PostSummaryGeneratorTests', '📝', 'Post Summary'), +} + + +def print_header(config: TestConfig): + """Print pipeline header""" + print(f"{Colors.BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━{Colors.NC}") + print(f"{Colors.BLUE}{config.icon} {config.name} Evaluation Pipeline{Colors.NC}") + print(f"{Colors.BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━{Colors.NC}") + print("") + + +def print_step(num: int, message: str): + """Print step message""" + print(f"{Colors.CYAN}Step {num}: {message}{Colors.NC}") + print("") + + +def check_dependencies(): + """Check required dependencies are installed""" + if not shutil.which('claude'): + print(f"{Colors.YELLOW}Error: claude CLI not found{Colors.NC}") + print("Install with: pip install claude-cli && claude configure") + sys.exit(1) + + +def run_swift_tests( + modules_dir: Path, + xcresult_path: Path, + output_dir: Path, + simulator_name: str, + test_target: str, + config: TestConfig, +) -> Path: + """Run Swift tests and return path to test output file""" + print_step(1, f"Running Swift {config.name} tests...") + print("") + + # Create output directories + output_dir.mkdir(parents=True, exist_ok=True) + + # Remove existing xcresult bundle if it exists + if xcresult_path.exists(): + shutil.rmtree(xcresult_path) + + print("Running tests...") + + # Prepare xcodebuild command + cmd = [ + 'xcodebuild', 'test', + '-scheme', 'Modules-Package', + '-destination', f'platform=iOS Simulator,name={simulator_name},OS=26.2', + '-only-testing', test_target, + '-resultBundlePath', str(xcresult_path), + ] + + # Run tests with output capture + test_output_file = output_dir / 'swift-test-output.txt' + with open(test_output_file, 'w') as f: + env = {'NSUnbufferedIO': 
'YES', **os.environ.copy()} + result = subprocess.run( + cmd, + cwd=modules_dir, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + + # Write and display output + f.write(result.stdout) + print(result.stdout) + + print("") + if result.returncode == 0: + print(f"{Colors.GREEN}✓ Tests passed{Colors.NC}") + else: + print(f"{Colors.YELLOW}⚠ Some tests failed - continuing with evaluation{Colors.NC}") + + print("") + print(f"{Colors.BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━{Colors.NC}") + print("") + + return test_output_file + + +def convert_result_to_dict(result: TestResult, test_type: TestType) -> Dict[str, Any]: + """Convert TestResult to JSON-serializable dict""" + result_dict = { + "id": result.id, + "testName": result.test_name, + "testType": result.test_type, + "status": result.status, + "input": result.input_data, + "averageScore": result.average_score, + "lowestScore": result.lowest_score, + "duration": result.duration, + } + + # Add items based on test type + if test_type == TestType.EXCERPT: + result_dict["excerpts"] = [ + { + "number": item.number, + "text": item.text, + "wordCount": item.word_count, + "status": item.status, + "failureReason": item.failure_reason, + "scores": item.scores, + "overall": item.overall, + } + for item in result.items + ] + if result.diversity: + result_dict["diversity"] = result.diversity + elif test_type == TestType.TAG: + result_dict["tags"] = [ + { + "number": item.number, + "text": item.text, + "wordCount": item.word_count, + "status": item.status, + "failureReason": item.failure_reason, + "scores": item.scores, + "overall": item.overall, + } + for item in result.items + ] + elif test_type == TestType.SUMMARY: + if result.items: + item = result.items[0] + result_dict["summary"] = { + "text": item.text, + "wordCount": item.word_count, + "scores": item.scores, + "overall": item.overall, + } + + return result_dict + + +def calculate_category_averages(results: List[Dict[str, Any]], test_type: 
TestType) -> Dict[str, float]: + """Calculate average scores by category""" + config = get_config(test_type) + category_sums = {criterion.name: [] for criterion in config.criteria} + + # Extract scores based on test type + for result in results: + if test_type == TestType.EXCERPT: + for excerpt in result.get("excerpts", []): + scores = excerpt.get("scores", {}) + for criterion in config.criteria: + if criterion.name in scores: + category_sums[criterion.name].append(scores[criterion.name]) + elif test_type == TestType.TAG: + for tag in result.get("tags", []): + scores = tag.get("scores", {}) + for criterion in config.criteria: + if criterion.name in scores: + category_sums[criterion.name].append(scores[criterion.name]) + elif test_type == TestType.SUMMARY: + summary = result.get("summary", {}) + scores = summary.get("scores", {}) + for criterion in config.criteria: + if criterion.name in scores: + category_sums[criterion.name].append(scores[criterion.name]) + + # Calculate averages + averages = {} + for criterion in config.criteria: + values = category_sums[criterion.name] + averages[criterion.name] = sum(values) / len(values) if values else 0.0 + + # Add diversity for excerpts + if test_type == TestType.EXCERPT: + diversity_scores = [ + result.get("diversity", {}).get("score", 0) + for result in results + if result.get("diversity") and result["diversity"].get("score", 0) > 0 + ] + averages["diversity"] = sum(diversity_scores) / len(diversity_scores) if diversity_scores else 0.0 + + return averages + + +def generate_json_report(results: List[TestResult], output_file: Path, test_type: TestType, model: str = "sonnet", output_dir: Path = None): + """Generate JSON evaluation report""" + config = get_config(test_type) + + # Convert results to dicts + result_dicts = [convert_result_to_dict(r, test_type) for r in results] + + # Calculate statistics + total = len(result_dicts) + passed = sum(1 for r in result_dicts if r["status"] == "passed") + failed = sum(1 for r in 
result_dicts if r["status"] == "failed") + needs_improvement = sum(1 for r in result_dicts if r["status"] == "needsImprovement") + pass_rate = passed / total if total > 0 else 0.0 + + # Calculate average overall score + avg_overall = sum(r["averageScore"] for r in result_dicts) / total if total > 0 else 0.0 + + # Calculate category averages + category_averages = calculate_category_averages(result_dicts, test_type) + + # Build thresholds + thresholds = { + "pass": { + "overall": config.pass_threshold, + }, + "needsImprovement": { + "overall": config.needs_improvement_threshold, + }, + } + + # Add critical thresholds + for criterion in config.criteria: + if criterion.critical_threshold > 0: + thresholds["pass"][criterion.name] = criterion.critical_threshold + + # Generate report + report = { + "metadata": { + "suite": config.test_class.replace("Tests", "").lower(), + "testType": test_type.value, + "version": "2.0", + "timestamp": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"), + "duration": sum(r["duration"] for r in result_dicts), + "claude_model": model, + "test_platform": "iOS Simulator", + "outputDirectory": str(output_dir.resolve()) if output_dir else None, + }, + "summary": { + "total": total, + "passed": passed, + "failed": failed, + "needsImprovement": needs_improvement, + "averageScore": avg_overall, + "averageByCategory": category_averages, + "passRate": pass_rate, + }, + "thresholds": thresholds, + "results": result_dicts, + } + + with open(output_file, 'w') as f: + json.dump(report, f, indent=2) + + print(f" ✓ JSON report: {output_file}") + + +def generate_html_report(script_dir: Path, eval_run_dir: Path, base_output_dir: Path): + """Generate HTML report with embedded data""" + viewer_html = script_dir / 'evaluation-viewer.html' + results_json = eval_run_dir / 'evaluation-results.json' + report_html = eval_run_dir / 'evaluation-report.html' + + if not viewer_html.exists() or not results_json.exists(): + return + + cmd = [ + sys.executable, + 
str(script_dir / 'inject-report-data.py'), + str(viewer_html), + str(results_json), + str(report_html), + ] + + result = subprocess.run(cmd, cwd=script_dir, capture_output=True) + + if result.returncode == 0: + print(f" ✓ HTML report: {report_html}") + print("") + print(" To compare with baselines:") + print(" 1. Open evaluation-report.html in your browser") + print(" 2. Check browser console for baseline folder location") + print(" 3. Click 'Compare with baseline' or drag and drop a baseline JSON file") + print(f" 4. Baseline directories: {base_output_dir}/evaluation-*/") + else: + print(" ✗ HTML report generation failed") + + +def show_console_summary(script_dir: Path, eval_run_dir: Path, xcresult_path: Path): + """Display console summary of results""" + console_reporter = script_dir / 'console_reporter.py' + results_json = eval_run_dir / 'evaluation-results.json' + + if not console_reporter.exists() or not results_json.exists(): + return + + cmd = [ + sys.executable, + str(console_reporter), + str(results_json), + str(xcresult_path), + ] + + subprocess.run(cmd, cwd=script_dir) + + +def main(): + parser = argparse.ArgumentParser( + description='Evaluation pipeline for WordPressIntelligence tests' + ) + + # New-style arguments (full pipeline) + parser.add_argument( + '--test-type', + default='excerpts', + choices=['excerpts', 'tags', 'summary'], + help='Test type to run (default: excerpts)' + ) + parser.add_argument( + '--model', + default='sonnet', + choices=['sonnet', 'opus', 'haiku'], + help='Claude model to use (default: sonnet)' + ) + parser.add_argument( + '--skip-tests', + action='store_true', + help='Skip test execution and use existing output' + ) + parser.add_argument( + '--simulator', + default='iPhone 17 Pro', + help='Simulator name (default: iPhone 17 Pro)' + ) + parser.add_argument( + '--only-testing', + default='', + help='Run only specific test (e.g., "PostExcerptGeneratorTests/spanishHTMLContent()")' + ) + + # Legacy arguments (for backward 
compatibility) + parser.add_argument( + '--test-output', + help='Path to swift test output file (legacy mode)' + ) + parser.add_argument( + '--output-dir', + help='Output directory for results (legacy mode)' + ) + + args = parser.parse_args() + + # Check dependencies + check_dependencies() + + # Determine if running in legacy mode + legacy_mode = args.test_output is not None and args.output_dir is not None + + if legacy_mode: + # Legacy mode: just extract and evaluate + test_output_file = Path(args.test_output) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + step_offset = 0 + else: + # New mode: full pipeline + script_dir = Path(__file__).parent.resolve() + project_root = script_dir.parent.parent.parent.parent + modules_dir = project_root / 'Modules' + tmp_dir = Path(os.environ.get('TMPDIR', '/tmp')) + base_output_dir = tmp_dir / 'WordPressIntelligence-Tests' + + # Create timestamped evaluation directory + timestamp = datetime.now().strftime('%Y-%m-%d-%H%M%S') + output_dir = base_output_dir / f'evaluation-{timestamp}' + + # Get test configuration + config = TEST_CONFIGS.get(args.test_type, TEST_CONFIGS['excerpts']) + + # Print header + print_header(config) + print(f"Test results: {base_output_dir}/test-results.xcresult") + print(f"Evaluation results: {output_dir}") + print("") + + # Setup paths + xcresult_path = base_output_dir / 'test-results.xcresult' + + # Step 1: Run Swift tests (unless skipped) + if not args.skip_tests: + # Determine test target + if args.only_testing: + test_target = f'WordPressIntelligenceTests/{args.only_testing}' + print(f" Only testing: {test_target}") + else: + test_target = f'WordPressIntelligenceTests/{config.test_class}' + + test_output_file = run_swift_tests( + modules_dir=modules_dir, + xcresult_path=xcresult_path, + output_dir=base_output_dir, + simulator_name=args.simulator, + test_target=test_target, + config=config, + ) + step_offset = 1 + else: + print(f"{Colors.YELLOW}Skipping test 
execution (using existing output){Colors.NC}") + print("") + output_dir.mkdir(parents=True, exist_ok=True) + test_output_file = base_output_dir / 'swift-test-output.txt' + step_offset = 1 + + # Step 2: Extract test outputs + print_step(step_offset + 1, "Extracting test outputs from console...") + try: + test_outputs = extract_test_outputs(test_output_file) + except Exception as e: + print(f"Error: {e}") + sys.exit(1) + + print("") + print(f"{Colors.BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━{Colors.NC}") + print("") + + # Step 3: Evaluate with Claude + print_step(step_offset + 2, f"Evaluating with Claude CLI (model: {args.model})...") + + claude = ClaudeClient(model=args.model) + all_results: List[TestResult] = [] + + for idx, test_data in enumerate(test_outputs, 1): + # Auto-detect test type + test_type_str = test_data.get('testType', 'excerpt-generation') + test_type = detect_test_type(test_type_str) + + # Create evaluator and evaluate + evaluator = create_evaluator(test_type, claude) + result = evaluator.evaluate_test(test_data, idx) + all_results.append(result) + + print(f"{Colors.BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━{Colors.NC}") + print("") + + # Step 4: Generate JSON report + print_step(step_offset + 3, "Generating evaluation reports...") + + # Detect test type from first result (all should be same type) + if all_results: + test_type_str = all_results[0].test_type + test_type = detect_test_type(test_type_str) + + json_output = output_dir / "evaluation-results.json" + generate_json_report(all_results, json_output, test_type, args.model, output_dir) + + print("") + + # Generate HTML report (only in new mode) + if not legacy_mode: + generate_html_report(script_dir, output_dir, base_output_dir) + + print("") + print(f"{Colors.BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━{Colors.NC}") + print("") + + # Display console summary + show_console_summary(script_dir, output_dir, xcresult_path) + + print("") + + # Return exit code based on failures + 
failed_count = sum(1 for r in all_results if r.status == "failed") + sys.exit(failed_count) + + +if __name__ == "__main__": + main() diff --git a/Modules/Tests/WordPressIntelligenceTests/lib/evaluation-viewer.html b/Modules/Tests/WordPressIntelligenceTests/lib/evaluation-viewer.html new file mode 100644 index 000000000000..a6c44c493722 --- /dev/null +++ b/Modules/Tests/WordPressIntelligenceTests/lib/evaluation-viewer.html @@ -0,0 +1,2808 @@ + + + + + Evaluation Report Viewer + + + +
+ + + + +
+

No Report Loaded

+

Select an evaluation JSON file to begin

+
+
+ +
+
+ Drop baseline JSON file here to compare +
+
# --- residue: tail of evaluation-viewer.html diff (markup stripped in extraction) ---
# diff --git a/Modules/Tests/WordPressIntelligenceTests/lib/evaluators.py (new file)

#!/usr/bin/env python3
"""
Evaluators for WordPressIntelligence test results

This module provides the evaluation framework for assessing AI-generated content
(excerpts, tags, summaries) using Claude as an LLM judge. Each evaluator:
1. Extracts generated items from test output
2. Builds evaluation prompts with criteria
3. Sends to Claude for scoring
4. Determines pass/fail/needs-improvement status

Classes:
    BaseEvaluator: Abstract base with shared evaluation logic
    ExcerptEvaluator: Evaluates post excerpts with diversity checking
    TagEvaluator: Evaluates tag suggestions
    SummaryEvaluator: Evaluates post summaries
"""

from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, Any, List

from config import TestTypeConfig, TestType, get_config
from claude_client import ClaudeClient


# ANSI color codes for terminal output
class Colors:
    RED = '\033[0;31m'
    GREEN = '\033[0;32m'
    YELLOW = '\033[1;33m'
    BLUE = '\033[0;34m'
    CYAN = '\033[0;36m'
    NC = '\033[0m'


@dataclass
class EvaluationResult:
    """Result of evaluating a single generated item.

    Attributes:
        number: Item number (1-indexed)
        text: The generated text (excerpt/tag/summary)
        scores: Per-criterion scores (e.g., {"relevance": 8.5, "clarity": 9.0})
        overall: Weighted overall score (0-10)
        status: "passed", "needsImprovement", or "failed"
        failure_reason: Why it failed/needs improvement (empty if passed)
        word_count: Number of words in the text
    """
    number: int
    text: str
    scores: Dict[str, float]
    overall: float
    status: str
    failure_reason: str = ""
    word_count: int = 0


@dataclass
class TestResult:
    """Complete test case result with all evaluated items.

    Attributes:
        id: Test identifier (e.g., "test_1")
        test_name: Human-readable test name
        test_type: Type of test (excerpt-generation, tag-suggestion, etc.)
        status: Overall status (worst of all items)
        input_data: Original input metadata (language, content preview, etc.)
        items: All evaluated items (excerpts, tags, or summary)
        average_score: Mean score across all items
        lowest_score: Minimum score across all items
        duration: Test execution time in seconds
        diversity: Diversity evaluation (excerpts only, optional)
    """
    id: str
    test_name: str
    test_type: str
    status: str
    input_data: Dict[str, Any]
    items: List[EvaluationResult]
    average_score: float
    lowest_score: float
    duration: float
    diversity: Dict[str, Any] | None = None


class BaseEvaluator(ABC):
    """Abstract base class for test evaluators.

    Subclasses implement extract_items() and build_evaluation_prompt()
    to customize for different test types (excerpts, tags, summaries).
    """

    # Shared scoring instructions appended to every per-item prompt. The three
    # concrete evaluators previously duplicated this text verbatim.
    SCORING_GUIDANCE = (
        "Rate each criterion 1.0-10.0 (use decimals for precision, e.g., 7.3, 8.6):\n"
        "- Very few things deserve a perfect 10.0 - reserve this for truly exceptional quality\n"
        "- Be critical and nuanced in your scoring - use the full range"
    )

    # A diversity score below this downgrades the whole test to
    # "needsImprovement" (was an inline magic number 4.0).
    DIVERSITY_NEEDS_IMPROVEMENT_THRESHOLD = 4.0

    def __init__(self, config: TestTypeConfig, claude: ClaudeClient):
        """Initialize evaluator with config and Claude client."""
        self.config = config
        self.claude = claude

    @abstractmethod
    def build_evaluation_prompt(self, test_data: Dict[str, Any], item: str) -> str:
        """Build a Claude evaluation prompt (requesting JSON scores) for one item.

        Args:
            test_data: Full test data including original content and metadata
            item: Single item to evaluate (excerpt/tag/summary text)
        """

    @abstractmethod
    def extract_items(self, test_data: Dict[str, Any]) -> List[str]:
        """Extract items to evaluate from test output.

        Args:
            test_data: Full test data from Swift tests

        Returns:
            List of text items to evaluate (excerpts, tags, or [summary])
        """

    def _criteria_lines(self) -> str:
        """One '- name (1-10): description' line per configured criterion."""
        return "\n".join(
            f"- {c.name} (1-10): {c.description}"
            for c in self.config.criteria
        )

    def _json_fields(self) -> str:
        """Comma-joined JSON key skeleton for the expected Claude response.

        NOTE(review): the per-field score placeholder was lost to markup
        stripping in the extracted source; "<score>" is a reconstruction -
        confirm the exact placeholder text against the repository.
        """
        return ', '.join(f'"{c.name}": <score>' for c in self.config.criteria)

    def evaluate_test(self, test_data: Dict[str, Any], result_id: int) -> TestResult:
        """Evaluate a complete test case with all generated items.

        Args:
            test_data: Full test output from Swift tests
            result_id: Sequential test number for identification

        Returns:
            Complete TestResult with all item scores and overall status
        """
        test_name = test_data.get('testName', f'test_{result_id}')
        language = test_data.get('language', 'unknown')
        duration = test_data.get('duration', 0.0)

        print(f"{Colors.CYAN}Evaluating: {test_name}{Colors.NC}")
        print(f" Type: {self.config.name} | Language: {language}")

        # Extract and validate items
        items = self.extract_items(test_data)
        if not items:
            print(f" {Colors.RED}No items found{Colors.NC}")
            return self._create_empty_result(test_data, result_id, duration)

        print(f" Found {len(items)} item(s)")

        # Evaluate each item with Claude; items whose evaluation fails (None)
        # are dropped rather than failing the whole test.
        item_results = []
        for num, item in enumerate(items, 1):
            evaluated = self.evaluate_item(test_data, item, num, len(items))
            if evaluated:
                item_results.append(evaluated)

        if not item_results:
            return self._create_empty_result(test_data, result_id, duration)

        # Aggregate scores
        avg_score = sum(r.overall for r in item_results) / len(item_results)
        lowest_score = min(r.overall for r in item_results)
        status = self._calculate_overall_status(item_results)

        # Diversity only makes sense across several alternatives (excerpts).
        diversity = None
        if self.config.diversity_check and len(items) >= 2:
            diversity = self.evaluate_diversity(items)
            if diversity and diversity.get('score', 0) < self.DIVERSITY_NEEDS_IMPROVEMENT_THRESHOLD:
                status = "needsImprovement"

        print(f" {Colors.CYAN}Overall: {avg_score:.1f}/10 avg, {lowest_score:.1f}/10 min{Colors.NC}")
        print("")

        return TestResult(
            id=f"test_{result_id}",
            test_name=test_name,
            test_type=test_data.get('testType', self.config.name.lower()),
            status=status,
            input_data=self._extract_input_data(test_data),
            items=item_results,
            average_score=avg_score,
            lowest_score=lowest_score,
            duration=duration,
            diversity=diversity,
        )

    def evaluate_item(self, test_data: Dict[str, Any], item: str, num: int, total: int) -> EvaluationResult | None:
        """Evaluate a single item using Claude as judge.

        Args:
            test_data: Full test data for context
            item: Text to evaluate
            num: Item number (1-indexed)
            total: Total number of items

        Returns:
            EvaluationResult if successful, None if Claude evaluation fails
        """
        preview = f"{item[:50]}..." if len(item) > 50 else item
        print(f" {Colors.BLUE}Item {num}/{total}{Colors.NC} ({preview})")

        prompt = self.build_evaluation_prompt(test_data, item)
        scores = self.claude.evaluate(prompt)

        if not scores:
            print(f" {Colors.RED}✗ Evaluation failed{Colors.NC}")
            return None

        overall = self._calculate_weighted_score(scores)
        status, failure_reason = self._determine_status(scores, overall)
        self._display_status(status, overall, failure_reason)

        return EvaluationResult(
            number=num,
            text=item,
            scores=scores,
            overall=overall,
            status=status,
            failure_reason=failure_reason,
            word_count=len(item.split()),
        )

    def _calculate_weighted_score(self, scores: Dict[str, float]) -> float:
        """Calculate weighted average of criterion scores (0-10).

        Missing criteria count as 0.0. Assumes config.total_weight is
        non-zero (a config with no weighted criteria would be a setup bug).
        """
        weighted_sum = sum(
            scores.get(c.name, 0.0) * c.weight
            for c in self.config.criteria
        )
        return weighted_sum / self.config.total_weight

    def _determine_status(self, scores: Dict[str, float], overall: float) -> tuple[str, str]:
        """Determine pass/needsImprovement/failed status.

        Returns:
            (status, failure_reason) tuple; failure_reason is "" when passed.
        """
        # Critical thresholds are hard gates: any one below its bar fails.
        for criterion in self.config.criteria:
            if criterion.critical_threshold > 0:
                score = scores.get(criterion.name, 0.0)
                if score < criterion.critical_threshold:
                    return "failed", f"Low {criterion.name} ({score:.1f}/10)"

        if overall >= self.config.pass_threshold:
            return "passed", ""
        if overall >= self.config.needs_improvement_threshold:
            return "needsImprovement", "Below target score"
        return "failed", f"Low overall score ({overall:.1f}/10)"

    def _display_status(self, status: str, score: float, failure_reason: str):
        """Display colored status message to console."""
        status_icons = {
            "passed": (Colors.GREEN, "✓"),
            "needsImprovement": (Colors.YELLOW, "⚠"),
            "failed": (Colors.RED, "✗"),
        }
        color, icon = status_icons.get(status, (Colors.RED, "✗"))
        reason_text = f" - {failure_reason}" if failure_reason else ""
        print(f" {color}{icon} {score:.1f}/10{Colors.NC}{reason_text}")

    def _calculate_overall_status(self, item_results: List[EvaluationResult]) -> str:
        """Calculate overall status from item results (worst status wins)."""
        if any(r.status == "failed" for r in item_results):
            return "failed"
        if any(r.status == "needsImprovement" for r in item_results):
            return "needsImprovement"
        return "passed"

    def _extract_input_data(self, test_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract common input metadata for the JSON result.

        Subclasses can override to add type-specific fields.
        """
        return {
            "language": test_data.get('language', 'unknown'),
            "originalContent": test_data.get('originalContent', ''),
            "originalContentPreview": test_data.get('originalContent', '')[:200],
        }

    def _create_empty_result(self, test_data: Dict[str, Any], result_id: int, duration: float) -> TestResult:
        """Create a failed result when no items were found or all evaluations failed."""
        return TestResult(
            id=f"test_{result_id}",
            test_name=test_data.get('testName', f'test_{result_id}'),
            test_type=test_data.get('testType', self.config.name.lower()),
            status="failed",
            input_data=self._extract_input_data(test_data),
            items=[],
            average_score=0.0,
            lowest_score=0.0,
            duration=duration,
        )

    def evaluate_diversity(self, items: List[str]) -> Dict[str, Any] | None:
        """Evaluate diversity across multiple generated items.

        Used for excerpts to ensure meaningful variation between options.
        Scores structural, angle, length, and lexical diversity.

        Args:
            items: List of generated items (excerpts)

        Returns:
            {"score": float, "feedback": str} or None if evaluation fails
        """
        print(f" {Colors.BLUE}Checking diversity...{Colors.NC}")

        items_text = "\n".join(f"Item {i+1}: {item}" for i, item in enumerate(items))

        # NOTE(review): the JSON placeholders below ("<score>", feedback text)
        # were lost to markup stripping in the extracted source and are a
        # reconstruction - confirm against the repository.
        prompt = f"""Evaluate the diversity of these {len(items)} variations. They should offer meaningful choices.

{items_text}

Score diversity across dimensions:
1. Structural Diversity: Different opening styles?
2. Angle Diversity: Different aspects emphasized?
3. Length Diversity: Varied sentence lengths?
4. Lexical Diversity: Different vocabulary?

Good diversity (7-10): Clearly distinct approaches, different hooks, reader can make meaningful choice
Poor diversity (1-6): Too similar, same structure, minor wording changes only

Respond with JSON only:
{{
  "structural": <score>,
  "angle": <score>,
  "length": <score>,
  "lexical": <score>,
  "overall": <score>,
  "feedback": "<one-sentence explanation>"
}}"""

        result = self.claude.evaluate(prompt)
        if result:
            score = result.get('overall', 0)
            print(f" {Colors.CYAN}Diversity: {score:.1f}/10{Colors.NC}")
            return {"score": score, "feedback": result.get('feedback', 'N/A')}

        return None


class ExcerptEvaluator(BaseEvaluator):
    """Evaluator for post excerpt generation.

    Evaluates generated excerpts against criteria like relevance, readability,
    and engagement. Also checks diversity when multiple excerpts are generated.
    """

    def extract_items(self, test_data: Dict[str, Any]) -> List[str]:
        """Extract excerpts array from test data."""
        return test_data.get('excerpts', [])

    def build_evaluation_prompt(self, test_data: Dict[str, Any], excerpt: str) -> str:
        """Build Claude prompt to evaluate a single excerpt."""
        content_preview = test_data.get('originalContent', '')[:500]
        language = test_data.get('language', 'unknown')
        style = test_data.get('style', 'unknown')
        length = test_data.get('length', 'unknown')

        return f"""Evaluate this excerpt and respond with JSON only:

Original Content (first 500 chars):
{content_preview}...

Generated Excerpt:
{excerpt}

Expected: {language} language, {length} length, {style} style

{self.SCORING_GUIDANCE}

{self._criteria_lines()}

Respond in this exact JSON format (scores must be numbers with one decimal place):
{{
  {self._json_fields()},
  "feedback": "<brief explanation>"
}}"""

    def _extract_input_data(self, test_data: Dict[str, Any]) -> Dict[str, Any]:
        """Add excerpt-specific metadata (style, length)."""
        data = super()._extract_input_data(test_data)
        data.update({
            "style": test_data.get('style', 'unknown'),
            "length": test_data.get('length', 'unknown'),
        })
        return data


class TagEvaluator(BaseEvaluator):
    """Evaluator for tag suggestions.

    Evaluates generated tags for relevance, appropriateness for the site's
    existing taxonomy, and whether they enhance discoverability.
    """

    def extract_items(self, test_data: Dict[str, Any]) -> List[str]:
        """Extract tags array from test data."""
        return test_data.get('tags', [])

    def build_evaluation_prompt(self, test_data: Dict[str, Any], tag: str) -> str:
        """Build Claude prompt to evaluate a single tag."""
        content_preview = test_data.get('originalContent', '')[:500]
        language = test_data.get('language', 'unknown')
        site_tags = ', '.join(test_data.get('siteTags', []))
        existing_tags = ', '.join(test_data.get('existingPostTags', []))

        return f"""Evaluate this tag suggestion and respond with JSON only:

Original Content (first 500 chars):
{content_preview}...

Site Tags: {site_tags}
Existing Post Tags: {existing_tags}

Generated Tag: {tag}

Expected: {language} language

{self.SCORING_GUIDANCE}

{self._criteria_lines()}

Respond in this exact JSON format (scores must be numbers with one decimal place):
{{
  {self._json_fields()},
  "feedback": "<brief explanation>"
}}"""

    def _extract_input_data(self, test_data: Dict[str, Any]) -> Dict[str, Any]:
        """Add tag-specific metadata (site tags, existing post tags)."""
        data = super()._extract_input_data(test_data)
        data.update({
            "siteTags": ', '.join(test_data.get('siteTags', [])),
            "existingPostTags": ', '.join(test_data.get('existingPostTags', [])),
        })
        return data


class SummaryEvaluator(BaseEvaluator):
    """Evaluator for post summaries.

    Evaluates generated summaries for accuracy, completeness,
    and conciseness in capturing the post's key points.
    """

    def extract_items(self, test_data: Dict[str, Any]) -> List[str]:
        """Extract summary from test data (returns single-item list or empty list)."""
        summary = test_data.get('summary')
        return [summary] if summary else []

    def build_evaluation_prompt(self, test_data: Dict[str, Any], summary: str) -> str:
        """Build Claude prompt to evaluate the summary."""
        content_preview = test_data.get('originalContent', '')[:500]
        language = test_data.get('language', 'unknown')

        return f"""Evaluate this post summary and respond with JSON only:

Original Content (first 500 chars):
{content_preview}...

Generated Summary:
{summary}

Expected: {language} language

{self.SCORING_GUIDANCE}

{self._criteria_lines()}

Respond in this exact JSON format (scores must be numbers with one decimal place):
{{
  {self._json_fields()},
  "feedback": "<brief explanation>"
}}"""


def create_evaluator(test_type: TestType, claude: ClaudeClient) -> BaseEvaluator:
    """Factory function to create appropriate evaluator for test type.

    Args:
        test_type: Type of test (EXCERPT, TAG, or SUMMARY)
        claude: ClaudeClient instance for LLM evaluation

    Returns:
        Concrete evaluator instance (ExcerptEvaluator, TagEvaluator, or SummaryEvaluator)
    """
    config = get_config(test_type)

    evaluators = {
        TestType.EXCERPT: ExcerptEvaluator,
        TestType.TAG: TagEvaluator,
        TestType.SUMMARY: SummaryEvaluator,
    }

    return evaluators[test_type](config, claude)

# --- residue: start of diff header for .../lib/extractors.py (continues on next line) ---
# --- residue: diff header for Modules/Tests/WordPressIntelligenceTests/lib/extractors.py (new file) ---

#!/usr/bin/env python3
"""
Test output extraction from console logs
Extracts base64-encoded JSON from test output markers
"""

import base64
import json
import re  # NOTE(review): unused in this chunk; kept because other usage may exist outside the visible code
import sys
import argparse
from pathlib import Path
from typing import List, Dict, Any

# Markers printed by the Swift tests around each base64-encoded JSON payload,
# one pair per supported test type.
_START_MARKERS = (
    "__EXCERPT_OUTPUT_START__",
    "__TAG_OUTPUT_START__",
    "__SUMMARY_OUTPUT_START__",
)
_END_MARKERS = (
    "__EXCERPT_OUTPUT_END__",
    "__TAG_OUTPUT_END__",
    "__SUMMARY_OUTPUT_END__",
)


def extract_test_outputs(test_output_file: Path) -> List[Dict[str, Any]]:
    """
    Extract test outputs from console log file.

    The Swift tests print blocks of base64-encoded JSON delimited by
    ``__*_OUTPUT_START__`` / ``__*_OUTPUT_END__`` markers; every such block is
    collected, decoded, and parsed.

    Args:
        test_output_file: Path to swift test output file

    Returns:
        List of parsed JSON test outputs

    Raises:
        FileNotFoundError: if the log file does not exist.
        ValueError: if no outputs could be extracted.
    """
    if not test_output_file.exists():
        raise FileNotFoundError(f"Test output file not found: {test_output_file}")

    outputs = []
    in_block = False
    current_content = []

    # Explicit UTF-8: the log may contain emoji/non-ASCII from xcodebuild and
    # must not depend on the platform's default encoding.
    with open(test_output_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')

            # Check for start markers (a new start resets any open block)
            if any(marker in line for marker in _START_MARKERS):
                in_block = True
                current_content = []
                continue

            # Check for end markers
            if any(marker in line for marker in _END_MARKERS):
                if in_block and current_content:
                    # Base64 payloads may be wrapped across lines; re-join first.
                    base64_content = ''.join(current_content)
                    try:
                        json_bytes = base64.b64decode(base64_content)
                        test_output = json.loads(json_bytes.decode('utf-8'))
                        outputs.append(test_output)
                    except Exception as e:
                        # Best effort: a corrupt block is reported, not fatal.
                        print(f"Warning: Failed to decode output: {e}")

                in_block = False
                current_content = []
                continue

            # Accumulate base64 content
            if in_block:
                current_content.append(line.strip())

    if in_block:
        # Fix: the original silently discarded a block left open at EOF.
        print("Warning: log ended inside an output block; partial block discarded")

    if not outputs:
        raise ValueError(
            "No test outputs extracted. "
            "This may mean:\n"
            "  - Tests failed before recording outputs\n"
            "  - Output markers weren't found in console\n"
            f"  - Check {test_output_file} for __*_OUTPUT_START__ markers"
        )

    print(f"Extracted {len(outputs)} test outputs")
    return outputs


# diff --git a/Modules/Tests/WordPressIntelligenceTests/lib/inject-report-data.py (new file)
# """Inject evaluation results JSON into the HTML viewer template."""

def inject_data(template_path, json_path, output_path, baseline_path=None):
    """Inject evaluation results JSON into the HTML viewer template.

    Replaces the ``const EMBEDDED_DATA = null;`` placeholder in the template
    with the compact JSON from ``json_path``; optionally also embeds a
    baseline JSON ahead of the EMBEDDED_BASELINE note comment.

    Returns:
        True on success, False if the main placeholder is missing.
    """
    # Explicit UTF-8 on all files: the viewer template contains non-ASCII
    # glyphs and must not depend on the platform's default encoding.
    with open(template_path, 'r', encoding='utf-8') as f:
        html_content = f.read()

    with open(json_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # Compact string keeps the generated HTML small.
    json_string = json.dumps(json_data, separators=(',', ':'))

    placeholder = 'const EMBEDDED_DATA = null;'
    replacement = f'const EMBEDDED_DATA = {json_string};'

    if placeholder not in html_content:
        print("ERROR: Could not find EMBEDDED_DATA placeholder in HTML template", file=sys.stderr)
        return False

    html_with_data = html_content.replace(placeholder, replacement)

    # If baseline is provided, inject it as well
    if baseline_path:
        with open(baseline_path, 'r', encoding='utf-8') as f:
            baseline_data = json.load(f)

        baseline_string = json.dumps(baseline_data, separators=(',', ':'))

        # The baseline constant is inserted just before this note comment.
        baseline_placeholder = '// Note: EMBEDDED_BASELINE may also be injected if comparison is requested'
        if baseline_placeholder in html_with_data:
            # NOTE(review): trailing indentation after the newline was mangled
            # in the extracted source - confirm against the repository.
            baseline_line = f'const EMBEDDED_BASELINE = {baseline_string};\n    '
            html_with_data = html_with_data.replace(baseline_placeholder, baseline_line + baseline_placeholder)
            print(f"✓ Injected {len(json_string)} bytes of main data + {len(baseline_string)} bytes of baseline data")
        else:
            print("Warning: Could not find baseline placeholder, baseline not injected", file=sys.stderr)
            print(f"✓ Injected {len(json_string)} bytes of main data (baseline injection failed)")
    else:
        print(f"✓ Injected {len(json_string)} bytes of JSON data into HTML report")

    # Write the output
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(html_with_data)

    return True


def main():
    """CLI entry point: template + data + output path, optional --baseline."""
    parser = argparse.ArgumentParser(description='Inject evaluation data into HTML report')
    parser.add_argument('template', help='HTML template file')
    parser.add_argument('data', help='Main evaluation JSON file')
    parser.add_argument('output', help='Output HTML file')
    parser.add_argument('--baseline', help='Optional baseline JSON file for comparison', default=None)

    args = parser.parse_args()

    success = inject_data(args.template, args.data, args.output, args.baseline)
    sys.exit(0 if success else 1)


if __name__ == '__main__':
    main()

# --- residue: diff for Modules/Tests/WordPressSharedTests/GutenbergExcerptGeneratorTests.swift ---
# Renames the test struct GutenbergExcerptGeneratorTests -> GutenbergPostExcerptGeneratorTests.
# The Swift body is cut off at the end of this chunk and is not reconstructed here.

Lorem ipsum dolor sit amet, [shortcode param=\"value\"]consectetur[/shortcode] adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.

Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.

Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

" diff --git a/Modules/Tests/WordPressSharedTests/IntelligenceServiceTests.swift b/Modules/Tests/WordPressSharedTests/IntelligenceServiceTests.swift deleted file mode 100644 index b9f4a8adccf6..000000000000 --- a/Modules/Tests/WordPressSharedTests/IntelligenceServiceTests.swift +++ /dev/null @@ -1,15 +0,0 @@ -import Testing -@testable import WordPressShared - -struct IntelligenceServiceTests { - @available(iOS 26, *) - @Test(.disabled("only for local testing")) - func suggestTags() async throws { - let tags = try await IntelligenceService() - .suggestTags( - post: IntelligenceUtilities.post, - siteTags: ["cooking", "healthy-foods"] - ) - print(tags) - } -} diff --git a/RELEASE-NOTES.txt b/RELEASE-NOTES.txt index 1eb9adfb1ca9..cef50b8bdb9f 100644 --- a/RELEASE-NOTES.txt +++ b/RELEASE-NOTES.txt @@ -1,6 +1,6 @@ 26.6 ----- - +* [**] [Intelligence] Expand AI-based features to more locales [#25034] 26.5 ----- diff --git a/WordPress/Classes/Utility/BuildInformation/FeatureFlag.swift b/WordPress/Classes/Utility/BuildInformation/FeatureFlag.swift index 9f344b213e5f..16055bf9c4fc 100644 --- a/WordPress/Classes/Utility/BuildInformation/FeatureFlag.swift +++ b/WordPress/Classes/Utility/BuildInformation/FeatureFlag.swift @@ -1,5 +1,6 @@ import BuildSettingsKit import Foundation +import FoundationModels /// FeatureFlag exposes a series of features to be conditionally enabled on /// different builds. @@ -80,8 +81,10 @@ public enum FeatureFlag: Int, CaseIterable { case .newStats: return false case .intelligence: - let languageCode = Locale.current.language.languageCode?.identifier - return (languageCode ?? 
"en").hasPrefix("en") + guard #available(iOS 26, *) else { + return false + } + return SystemLanguageModel.default.supportsLocale() case .newSupport: return false case .nativeBlockInserter: diff --git a/WordPress/Classes/ViewRelated/NewSupport/SupportDataProvider.swift b/WordPress/Classes/ViewRelated/NewSupport/SupportDataProvider.swift index c136bda4175d..3cf82c08ebe3 100644 --- a/WordPress/Classes/ViewRelated/NewSupport/SupportDataProvider.swift +++ b/WordPress/Classes/ViewRelated/NewSupport/SupportDataProvider.swift @@ -9,6 +9,7 @@ import WordPressCore import WordPressCoreProtocols import WordPressData import WordPressShared +import WordPressIntelligence import CocoaLumberjack extension SupportDataProvider { @@ -510,7 +511,7 @@ extension SupportAttachment { fileprivate func summarize(_ text: String) async -> String { if #available(iOS 26.0, *) { do { - return try await IntelligenceService().summarizeSupportTicket(content: text) + return try await SupportTicketSummaryGenerator.execute(content: text) } catch { return text } diff --git a/WordPress/Classes/ViewRelated/Post/PostSettings/Services/TagSuggestionsService.swift b/WordPress/Classes/ViewRelated/Post/PostSettings/Services/TagSuggestionsService.swift index 50cc71242302..b0ce6989f274 100644 --- a/WordPress/Classes/ViewRelated/Post/PostSettings/Services/TagSuggestionsService.swift +++ b/WordPress/Classes/ViewRelated/Post/PostSettings/Services/TagSuggestionsService.swift @@ -1,6 +1,7 @@ import Foundation import WordPressData import WordPressShared +import WordPressIntelligence @MainActor final class TagSuggestionsService { @@ -31,7 +32,7 @@ final class TagSuggestionsService { try Task.checkCancellation() - return try await IntelligenceService().suggestTags( + return try await TagSuggestionGenerator().generate( post: postContent, siteTags: siteTags, postTags: postTags diff --git a/WordPress/Classes/ViewRelated/Post/PostSettings/Views/Excerpt/PostSettingsExcerptEditor.swift 
b/WordPress/Classes/ViewRelated/Post/PostSettings/Views/Excerpt/PostSettingsExcerptEditor.swift index 668818775e53..0954e2db2b85 100644 --- a/WordPress/Classes/ViewRelated/Post/PostSettings/Views/Excerpt/PostSettingsExcerptEditor.swift +++ b/WordPress/Classes/ViewRelated/Post/PostSettings/Views/Excerpt/PostSettingsExcerptEditor.swift @@ -1,5 +1,6 @@ import SwiftUI import WordPressUI +import WordPressIntelligence import WordPressShared import DesignSystem @@ -42,7 +43,7 @@ struct PostSettingsExcerptEditor: View { .navigationBarTitleDisplayMode(.inline) .toolbar { ToolbarItem(placement: .topBarTrailing) { - if FeatureFlag.intelligence.enabled && !postContent.isEmpty && LanguageModelHelper.isSupported { + if FeatureFlag.intelligence.enabled && !postContent.isEmpty && IntelligenceService.isSupported { if #available(iOS 26, *) { PostSettingsGenerateExcerptButton( content: postContent, diff --git a/WordPress/Classes/ViewRelated/Post/PostSettings/Views/Excerpt/PostSettingsGenerateExcerptView.swift b/WordPress/Classes/ViewRelated/Post/PostSettings/Views/Excerpt/PostSettingsGenerateExcerptView.swift index a5f6f72b2be4..00caed22d196 100644 --- a/WordPress/Classes/ViewRelated/Post/PostSettings/Views/Excerpt/PostSettingsGenerateExcerptView.swift +++ b/WordPress/Classes/ViewRelated/Post/PostSettings/Views/Excerpt/PostSettingsGenerateExcerptView.swift @@ -2,6 +2,8 @@ import SwiftUI import WordPressUI import DesignSystem import FoundationModels +import WordPressShared +import WordPressIntelligence @available(iOS 26, *) struct PostSettingsGenerateExcerptView: View { @@ -11,10 +13,10 @@ struct PostSettingsGenerateExcerptView: View { @Environment(\.dismiss) private var dismiss @AppStorage("jetpack_ai_generated_excerpt_style") - private var style: GenerationStyle = .engaging + private var style: WritingStyle = .engaging @AppStorage("jetpack_ai_generated_excerpt_length") - private var length: GeneratedContentLength = .medium + private var length: ContentLength = .medium @State private 
var results: [ExcerptGenerationResult.PartiallyGenerated] = [] @State private var isGenerating = false @@ -162,9 +164,9 @@ struct PostSettingsGenerateExcerptView: View { Slider( value: Binding( get: { Double(length.rawValue) }, - set: { length = GeneratedContentLength(rawValue: Int($0)) ?? .medium } + set: { length = ContentLength(rawValue: Int($0)) ?? .medium } ), - in: 0...Double(GeneratedContentLength.allCases.count - 1), + in: 0...Double(ContentLength.allCases.count - 1), step: 1 ) { Text(Strings.lengthSliderAccessibilityLabel) @@ -199,7 +201,7 @@ struct PostSettingsGenerateExcerptView: View { Spacer(minLength: 8) Picker(Strings.stylePickerAccessibilityLabel, selection: $style) { - ForEach(GenerationStyle.allCases, id: \.self) { style in + ForEach(WritingStyle.allCases, id: \.self) { style in Text(style.displayName) .tag(style) } @@ -230,10 +232,8 @@ struct PostSettingsGenerateExcerptView: View { generationTask = Task { do { - let session = LanguageModelSession( - model: .init(guardrails: .permissiveContentTransformations), - instructions: LanguageModelHelper.generateExcerptInstructions - ) + let generator = PostExcerptGenerator(length: length, style: style) + let session = generator.makeSession() self.session = session try await actuallyGenerateExcerpts(in: session) } catch { @@ -273,8 +273,9 @@ struct PostSettingsGenerateExcerptView: View { isGenerating = false } - let content = IntelligenceService().extractRelevantText(from: postContent) - let prompt = isLoadMore ? LanguageModelHelper.generateMoreOptionsPrompt : LanguageModelHelper.makeGenerateExcerptPrompt(content: content, length: length, style: style) + let generator = PostExcerptGenerator(length: length, style: style) + let content = IntelligenceService.extractRelevantText(from: postContent) + let prompt = isLoadMore ? 
PostExcerptGenerator.loadMorePrompt : generator.makePrompt(content: content) let stream = session.streamResponse(to: prompt, generating: ExcerptGenerationResult.self) for try await result in stream { @@ -299,7 +300,7 @@ struct PostSettingsGenerateExcerptView: View { WPAnalytics.track(.intelligenceExcerptOptionsGenerated, properties: [ "length": length.trackingName, "style": style.rawValue, - "load_more": isLoadMore ? 1 : 0 + "load_more": isLoadMore ]) } } diff --git a/WordPress/Classes/ViewRelated/Reader/Controllers/ReaderPostActions/ReaderPostMenu.swift b/WordPress/Classes/ViewRelated/Reader/Controllers/ReaderPostActions/ReaderPostMenu.swift index eaa61addf0ea..12a8043b361a 100644 --- a/WordPress/Classes/ViewRelated/Reader/Controllers/ReaderPostActions/ReaderPostMenu.swift +++ b/WordPress/Classes/ViewRelated/Reader/Controllers/ReaderPostActions/ReaderPostMenu.swift @@ -4,6 +4,7 @@ import SafariServices import SwiftUI import WordPressData import WordPressShared +import WordPressIntelligence struct ReaderPostMenu { let post: ReaderPost diff --git a/WordPress/Classes/ViewRelated/Reader/Views/ReaderSummarizePostView.swift b/WordPress/Classes/ViewRelated/Reader/Views/ReaderSummarizePostView.swift index 7cf11dcdb604..800cda202956 100644 --- a/WordPress/Classes/ViewRelated/Reader/Views/ReaderSummarizePostView.swift +++ b/WordPress/Classes/ViewRelated/Reader/Views/ReaderSummarizePostView.swift @@ -1,7 +1,7 @@ import SwiftUI import WordPressUI import WordPressData -import FoundationModels +import WordPressIntelligence @available(iOS 26, *) struct ReaderSummarizePostView: View { @@ -72,13 +72,11 @@ struct ReaderSummarizePostView: View { do { let content = post.content ?? 
"" - let stream = await IntelligenceService().summarizePost(content: content) + let result = try await PostSummaryGenerator().generate(content: content) - for try await result in stream { - guard !Task.isCancelled else { return } - withAnimation(.smooth) { - summary = result.content - } + guard !Task.isCancelled else { return } + withAnimation(.smooth) { + summary = result } } catch { guard !Task.isCancelled else { return }