From 64f4f30df5096bc3cb594496dee96fc4ff5e355d Mon Sep 17 00:00:00 2001 From: solresol Date: Mon, 10 Nov 2025 07:57:19 +1100 Subject: [PATCH] Make log proportion chart use log rank and surface top tags --- padjective/build_site.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/padjective/build_site.py b/padjective/build_site.py index 88e4520..f2e76ac 100644 --- a/padjective/build_site.py +++ b/padjective/build_site.py @@ -1755,7 +1755,7 @@ def _generate_log_nonzero_proportion_chart( tag_data.sort(key=lambda x: x[0]) # Calculate cumulative proportion of non-zero coefficients - x_positions = [] + log_rank_positions = [] log_proportions = [] for i in range(len(tag_data)): @@ -1770,20 +1770,25 @@ def _generate_log_nonzero_proportion_chart( # Skip positions where proportion is 0 (can't take log) if proportion > 0: - x_positions.append(i) + current_rank = tag_data[i][0] + # Rank must be positive to take a logarithm. If ranks are 0 or negative, + # fall back to the 1-based index position to maintain monotonic growth. + if current_rank <= 0: + current_rank = i + 1 + log_rank_positions.append(math.log(current_rank)) log_proportions.append(math.log(proportion)) - if len(x_positions) < 2: + if len(log_rank_positions) < 2: return None # Create line chart fig, ax = plt.subplots(figsize=(12, 6)) - ax.plot(x_positions, log_proportions, color='#0b6ce3', linewidth=2, alpha=0.8) + ax.plot(log_rank_positions, log_proportions, color='#0b6ce3', linewidth=2, alpha=0.8) - ax.set_xlabel('Tag Position (ordered by battle rank)', fontsize=12, fontweight='bold') + ax.set_xlabel('log(Tag Rank)', fontsize=12, fontweight='bold') ax.set_ylabel('log(Proportion of Non-Zero Coefficients)', fontsize=12, fontweight='bold') - ax.set_title(f'Log Proportion of Non-Zero Coefficients by Tag Rank', fontsize=14, fontweight='bold', pad=15) + ax.set_title('Log-Log Proportion of Non-Zero Coefficients by Tag Rank', fontsize=14, fontweight='bold', pad=15) ax.grid(True, alpha=0.3, linestyle='--') # Add horizontal reference line at y=0 (proportion=1, or 100%) @@ -2002,7 +2007,7 @@ def _write_umllr_pages(output_dir: Path, summary: Dict[str, Any], conn=None, sch

Log Proportion of Non-Zero Coefficients

Log proportion of non-zero coefficients -
Logarithm of the cumulative proportion of tags with non-zero coefficients (excludes infinite p-adic valuation). Shows how the proportion of informative tags changes as more tags are included by battle ranking.
+
Logarithms of the cumulative proportion of tags with non-zero coefficients (excludes infinite p-adic valuation) plotted against the logarithm of their battle ranking. Shows how the proportion of informative tags changes as more tags are included by battle ranking.
""" @@ -2243,6 +2248,14 @@ def _build_index_html( if not class_rows: class_rows = 'No taxonomy class data available' + top_tags = taxonomy_summary.get("top_tags", [])[:10] + top_tag_rows = "\n".join( + f"{html.escape(row.get('tag') or '')}{html.escape(row.get('top_taxonomy_path') or 'Unknown')}{row.get('top_weight', 0.0):.4f}{row.get('max_abs_weight', 0.0):.4f}" + for row in top_tags + ) + if not top_tag_rows: + top_tag_rows = 'No tag signal data available' + chart_html = "" if taxonomy_dist_chart_path: chart_rel_path = taxonomy_dist_chart_path.relative_to(output_dir).as_posix() @@ -2261,6 +2274,11 @@ def _build_index_html( Taxonomy IDNamePathSamplesShare {class_rows} +

Tags with strongest signal

+ + + {top_tag_rows} +
TagTop taxonomyWeightMax |weight|
""" html_document = f"""