diff --git a/padjective/build_site.py b/padjective/build_site.py index 7469f9c..648ea26 100644 --- a/padjective/build_site.py +++ b/padjective/build_site.py @@ -2975,6 +2975,283 @@ def _write_umllr_pages(output_dir: Path, summary: Dict[str, Any], conn=None, sch +def _calculate_crossover_point( + model1_stats: Dict[str, float], + model2_stats: Dict[str, float], +) -> Optional[float]: + """Calculate the x-value where two regression lines intersect. + + Returns None if lines are parallel or if crossover is in the past (negative x). + """ + slope1, intercept1 = model1_stats['slope'], model1_stats['intercept'] + slope2, intercept2 = model2_stats['slope'], model2_stats['intercept'] + + # Check if slopes are too similar (parallel lines) + if abs(slope1 - slope2) < 1e-10: + return None + + # Calculate intersection: slope1 * x + intercept1 = slope2 * x + intercept2 + # => x = (intercept2 - intercept1) / (slope1 - slope2) + crossover_x = (intercept2 - intercept1) / (slope1 - slope2) + + # Only return if crossover is in the future (positive x increase) + if crossover_x > 0: + return crossover_x + return None + + +def _calculate_crossover_confidence( + x_data1: list, y_data1: list, + x_data2: list, y_data2: list, + crossover_x: float, + n_bootstrap: int = 1000, +) -> Optional[Dict[str, float]]: + """Calculate confidence interval for crossover point using bootstrap. + + Returns dict with 'lower_ci', 'upper_ci', and 'std_err' or None if insufficient data. 
+ """ + # Filter out None values + valid_pairs1 = [(x, y) for x, y in zip(x_data1, y_data1) if y is not None] + valid_pairs2 = [(x, y) for x, y in zip(x_data2, y_data2) if y is not None] + + if len(valid_pairs1) < 2 or len(valid_pairs2) < 2: + return None + + x1, y1 = zip(*valid_pairs1) + x2, y2 = zip(*valid_pairs2) + x1, y1 = np.array(x1), np.array(y1) + x2, y2 = np.array(x2), np.array(y2) + + crossovers = [] + rng = np.random.RandomState(42) # Fixed seed for reproducibility + + for _ in range(n_bootstrap): + # Bootstrap sample for model 1 + idx1 = rng.choice(len(x1), size=len(x1), replace=True) + result1 = stats.linregress(x1[idx1], y1[idx1]) + + # Bootstrap sample for model 2 + idx2 = rng.choice(len(x2), size=len(x2), replace=True) + result2 = stats.linregress(x2[idx2], y2[idx2]) + + # Calculate crossover for this bootstrap sample + if abs(result1.slope - result2.slope) > 1e-10: + cross_x = (result2.intercept - result1.intercept) / (result1.slope - result2.slope) + if cross_x > 0: # Only include positive crossovers + crossovers.append(cross_x) + + if len(crossovers) < 10: # Need enough valid samples + return None + + crossovers = np.array(crossovers) + return { + 'lower_ci': np.percentile(crossovers, 2.5), + 'upper_ci': np.percentile(crossovers, 97.5), + 'std_err': np.std(crossovers), + 'mean': np.mean(crossovers), + } + + +def _estimate_dataset_growth( + dates: list, + values: list, +) -> Optional[Dict[str, float]]: + """Estimate growth rate and predict future values based on historical data. + + Returns dict with 'daily_growth', 'r_squared', and prediction parameters. 
+ """ + # Filter out None values + valid_pairs = [(d, v) for d, v in zip(dates, values) if v is not None and d is not None] + if len(valid_pairs) < 2: + return None + + dates_valid, values_valid = zip(*valid_pairs) + + # Convert dates to days since first measurement + first_date = dates_valid[0] + days_since_start = np.array([(d - first_date).days for d in dates_valid]) + values_arr = np.array(values_valid) + + # Linear regression on values vs days + result = stats.linregress(days_since_start, values_arr) + + return { + 'daily_growth': result.slope, + 'intercept': result.intercept, + 'r_squared': result.rvalue ** 2, + 'p_value': result.pvalue, + 'first_date': first_date, + 'last_value': values_valid[-1], + 'last_days': days_since_start[-1], + } + + +def _predict_date_for_value( + growth_stats: Dict[str, float], + target_value: float, +) -> Optional[datetime]: + """Predict when dataset will reach target value based on growth stats.""" + if growth_stats['daily_growth'] <= 0: + return None + + # Solve: target_value = slope * days + intercept + # => days = (target_value - intercept) / slope + days_needed = (target_value - growth_stats['intercept']) / growth_stats['daily_growth'] + + if days_needed < growth_stats['last_days']: + # Target already achieved + return None + + # Calculate date + from datetime import timedelta + target_date = growth_stats['first_date'] + timedelta(days=int(days_needed)) + return target_date + + +def _format_extrapolation_analysis_html( + regression_stats: Dict[str, Dict[str, float]], + x_data_dict: Dict[str, list], + y_data_dict: Dict[str, list], + dates: Optional[list], + x_values: list, + x_label: str, + dataset_type: str, # "products" or "tags" +) -> str: + """Format extrapolation analysis showing when UMLLR will outperform other models.""" + if 'umllr' not in regression_stats: + return "" + + umllr_stats = regression_stats['umllr'] + + # Models to compare against - unconstrained models and other baselines + comparison_models = { + 'unn': 
('UNN (Unconstrained Neural Networks)', '#ec4899'), + 'ulr': ('ULR (Unconstrained Logistic Regression)', '#8b5cf6'), + 'dt': ('Decision Tree', '#14b8a6'), + } + + extrapolations = [] + + for model_key, (model_name, color) in comparison_models.items(): + if model_key not in regression_stats: + continue + + model_stats = regression_stats[model_key] + + # Check if UMLLR is improving faster (more negative slope) + if umllr_stats['slope'] >= model_stats['slope']: + # UMLLR is not improving faster, won't catch up + continue + + # Calculate crossover point + crossover_x = _calculate_crossover_point(umllr_stats, model_stats) + if crossover_x is None: + continue + + # Calculate confidence interval + umllr_x = x_data_dict.get('umllr', []) + umllr_y = y_data_dict.get('umllr', []) + model_x = x_data_dict.get(model_key, []) + model_y = y_data_dict.get(model_key, []) + + confidence = _calculate_crossover_confidence( + umllr_x, umllr_y, + model_x, model_y, + crossover_x + ) + + # Estimate when dataset will reach this size + date_prediction = None + if dates and x_values: + growth_stats = _estimate_dataset_growth(dates, x_values) + if growth_stats: + date_prediction = _predict_date_for_value(growth_stats, crossover_x) + + extrapolations.append({ + 'model_name': model_name, + 'model_key': model_key, + 'color': color, + 'crossover_x': crossover_x, + 'confidence': confidence, + 'date_prediction': date_prediction, + 'growth_stats': growth_stats if dates and x_values else None, + }) + + if not extrapolations: + return "" + + # Build HTML + rows = [] + for ext in extrapolations: + crossover_str = f"{ext['crossover_x']:,.0f}" + + if ext['confidence']: + conf = ext['confidence'] + confidence_str = ( + f"{conf['lower_ci']:,.0f} - {conf['upper_ci']:,.0f} " + f"(95% CI, σ={conf['std_err']:,.0f})" + ) + # Calculate probability based on how far current max is from crossover + current_max = max(x_values) if x_values else 0 + if ext['confidence']['mean'] > current_max: + # Simple probability 
estimate: if crossover CI doesn't include current value + prob_str = ">95%" if conf['lower_ci'] > current_max else "~50-95%" + else: + prob_str = "Already achieved" + else: + confidence_str = "N/A (insufficient data)" + prob_str = "N/A" + + if ext['date_prediction']: + date_str = ext['date_prediction'].strftime('%Y-%m-%d') + if ext['growth_stats']: + growth_rate = ext['growth_stats']['daily_growth'] + r2 = ext['growth_stats']['r_squared'] + date_str += f" (±uncertain, R²={r2:.3f}, growth={growth_rate:.1f}/{x_label}/day)" + else: + date_str = "N/A (already achieved or negative growth)" + + rows.append( + f'' + f'{ext["model_name"]}' + f'{crossover_str}' + f'{confidence_str}' + f'{prob_str}' + f'{date_str}' + f'' + ) + + return f""" +
+

Extrapolation Analysis: When Will Importance-Optimised p-adic LR Outperform Other Models?

+

+ Based on current regression trends, we can extrapolate when Importance-Optimised p-adic LR + will achieve better performance (lower p-adic loss) than other models as the dataset grows. + The confidence intervals are calculated using bootstrap resampling (n=1000). +

+                <table>
+                    <thead>
+                        <tr>
+                            <th>Model</th>
+                            <th>Crossover Point ({x_label}s)</th>
+                            <th>95% Confidence Interval</th>
+                            <th>Probability</th>
+                            <th>Estimated Date</th>
+                        </tr>
+                    </thead>
+                    <tbody>
+                        {''.join(rows)}
+                    </tbody>
+                </table>
+

+ Statistical Notes: The crossover points are calculated by finding where the + regression lines intersect. The 95% confidence intervals are derived from bootstrap resampling + of the regression parameters. The probability estimates indicate the likelihood that the crossover + will occur given the current trends. Date predictions are based on linear extrapolation of dataset + growth and should be interpreted with caution. +

+
""" + def _format_regression_stats_html(stats: Optional[Dict[str, Dict[str, float]]], x_label: str) -> str: """Format regression statistics as an HTML table.""" @@ -3058,6 +3335,16 @@ def _build_trends_section( perf_vs_tags_stats: Optional[Dict[str, Dict[str, float]]] = None, params_vs_loss_stats: Optional[Dict[str, Dict[str, float]]] = None, unconstrained_log_stats: Optional[Dict[str, Any]] = None, + products_x_data: Optional[Dict[str, list]] = None, + products_y_data: Optional[Dict[str, list]] = None, + products_dates: Optional[list] = None, + products_x_values: Optional[list] = None, + tags_x_data: Optional[Dict[str, list]] = None, + tags_y_data: Optional[Dict[str, list]] = None, + tags_dates: Optional[list] = None, + tags_x_values: Optional[list] = None, + products_trajectory_path: Optional[Path] = None, + tags_trajectory_path: Optional[Path] = None, ) -> str: """Build HTML section for historical trends charts.""" if not trends_chart_path: @@ -3070,21 +3357,71 @@ def _build_trends_section( if perf_vs_products_chart_path: products_chart_rel = perf_vs_products_chart_path.relative_to(output_dir).as_posix() products_stats_html = _format_regression_stats_html(perf_vs_products_stats, "product") + + # Add extrapolation analysis + products_extrapolation_html = "" + if perf_vs_products_stats and products_x_data and products_y_data: + products_extrapolation_html = _format_extrapolation_analysis_html( + perf_vs_products_stats, + products_x_data, + products_y_data, + products_dates, + products_x_values, + "product", + "products" + ) + + # Add trajectory chart + products_trajectory_html = "" + if products_trajectory_path: + trajectory_rel = products_trajectory_path.relative_to(output_dir).as_posix() + products_trajectory_html = f""" +
+ Model performance trajectory vs number of products +
""" + perf_vs_products_html = f"""
Model performance vs number of products
- {products_stats_html}""" + {products_stats_html} + {products_extrapolation_html} + {products_trajectory_html}""" perf_vs_tags_html = "" if perf_vs_tags_chart_path: tags_chart_rel = perf_vs_tags_chart_path.relative_to(output_dir).as_posix() tags_stats_html = _format_regression_stats_html(perf_vs_tags_stats, "tag") + + # Add extrapolation analysis + tags_extrapolation_html = "" + if perf_vs_tags_stats and tags_x_data and tags_y_data: + tags_extrapolation_html = _format_extrapolation_analysis_html( + perf_vs_tags_stats, + tags_x_data, + tags_y_data, + tags_dates, + tags_x_values, + "tag", + "tags" + ) + + # Add trajectory chart + tags_trajectory_html = "" + if tags_trajectory_path: + trajectory_rel = tags_trajectory_path.relative_to(output_dir).as_posix() + tags_trajectory_html = f""" +
+ Model performance trajectory vs number of distinct tags +
""" + perf_vs_tags_html = f"""
Model performance vs number of distinct tags
- {tags_stats_html}""" + {tags_stats_html} + {tags_extrapolation_html} + {tags_trajectory_html}""" params_vs_loss_html = "" if params_vs_loss_chart_path: @@ -4020,7 +4357,7 @@ def _build_index_html( {taxonomy_overview_html} - {_build_trends_section(trends_chart_path, perf_vs_products_chart_path, perf_vs_tags_chart_path, params_vs_loss_chart_path, unconstrained_log_chart_path, output_dir, perf_vs_products_stats, perf_vs_tags_stats, params_vs_loss_stats, unconstrained_log_stats)} + {_build_trends_section(trends_chart_path, perf_vs_products_chart_path, perf_vs_tags_chart_path, params_vs_loss_chart_path, unconstrained_log_chart_path, output_dir, perf_vs_products_stats, perf_vs_tags_stats, params_vs_loss_stats, unconstrained_log_stats, products_x_data, products_y_data, products_dates, products_x_values, tags_x_data, tags_y_data, tags_dates, tags_x_values, products_trajectory_path, tags_trajectory_path)}