diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5163d9b..c252199 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -76,6 +76,18 @@ jobs: go-version: '1.24' check-latest: true + - name: Check Go formatting + run: | + echo "Checking Go code formatting..." + if [ -n "$(gofmt -l .)" ]; then + echo "❌ The following files need formatting:" + gofmt -l . + echo "" + echo "Run 'make fmt' or 'gofmt -w .' to format them" + exit 1 + fi + echo "✓ All files are properly formatted" + - name: Run golangci-lint uses: golangci/golangci-lint-action@v6 # v6.1.0 with: diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 0000000..11f932d --- /dev/null +++ b/.golangci.yml @@ -0,0 +1,87 @@ +run: + timeout: 5m + tests: true + modules-download-mode: readonly + +linters: + enable: + - gofmt # Check that code is formatted (auto-fixable) + - goimports # Check import statements are formatted (auto-fixable) + - govet # Check for common mistakes + - errcheck # Check for unchecked errors + - staticcheck # Advanced static analysis + - ineffassign # Detect ineffectual assignments + - unused # Check for unused code (auto-fixable) + - gosimple # Suggest code simplifications (auto-fixable) + - gocritic # Comprehensive code analyzer + - misspell # Check for misspelled words (auto-fixable) + - godot # Check that comments end in periods (auto-fixable) + - prealloc # Find slice preallocation opportunities + - nolintlint # Check nolint directives + - revive # Fast, configurable linter + - bodyclose # Check HTTP response body is closed + - gci # Sorts imports (auto-fixable) + - whitespace # Checks for whitespace issues (auto-fixable) + +linters-settings: + gofmt: + simplify: true + goimports: + local-prefixes: github.com/vladkampov/markdocify + gci: + sections: + - standard + - default + - prefix(github.com/vladkampov/markdocify) + govet: + enable: + - shadow + errcheck: + check-type-assertions: true + check-blank: true + gocritic: + enabled-tags: + - diagnostic + - performance + - style + disabled-checks: + - commentedOutCode + revive: + severity: warning + rules: + - name: blank-imports + - name: context-as-argument + - name: context-keys-type + - name: dot-imports + - name: error-return + - name: error-strings + - name: error-naming + - name: exported + - name: if-return + - name: increment-decrement + - name: var-naming + - name: var-declaration + - name: package-comments + - name: range + - name: receiver-naming + - name: time-naming + - name: unexported-return + - name: indent-error-flow + - name: errorf + - name: empty-block + - name: superfluous-else + - name: unreachable-code + - name: redefines-builtin-id + +issues: + exclude-rules: + - path: _test\.go + linters: + - errcheck + - ineffassign + - path: cmd/ + linters: + - gochecknoinits + exclude-use-default: false + max-issues-per-linter: 0 + max-same-issues: 0 \ No newline at end of file diff --git a/Makefile b/Makefile index c883b41..d7613ae 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: build test clean install lint security-scan +.PHONY: build test clean install lint security-scan fmt fmt-check check-all fmt-fix # Build configuration BINARY_NAME=markdocify @@ -34,6 +34,53 @@ lint: security-scan: gosec ./... +# Format Go code +fmt: + @echo "Formatting Go code..." + @gofmt -w . + @goimports -w . + @echo "✓ Code formatted" + +# Auto-fix what we can with golangci-lint +fmt-fix: + @echo "Auto-fixing code issues..." + @gofmt -w . + @goimports -w . + @if command -v golangci-lint >/dev/null 2>&1; then \ + echo "Running golangci-lint with auto-fix..."; \ + golangci-lint run --fix; \ + else \ + echo "⚠️ golangci-lint not installed - only running gofmt/goimports"; \ + fi + @echo "✓ Auto-fixes applied" + +# Check if code is formatted +fmt-check: + @echo "Checking Go code formatting..." + @if [ -n "$$(gofmt -l .)" ]; then \ + echo "❌ The following files need formatting:"; \ + gofmt -l .; \ + echo "Run 'make fmt' to format them"; \ + exit 1; \ + else \ + echo "✓ All files are properly formatted"; \ + fi + +# Run all checks locally (same as CI) +check-all: fmt-check + @echo "Running all local checks..." + @if command -v golangci-lint >/dev/null 2>&1; then \ + echo "Running linter..."; \ + golangci-lint run; \ + else \ + echo "⚠️ golangci-lint not installed. Install with:"; \ + echo " brew install golangci-lint"; \ + echo " OR: go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest"; \ + fi + @echo "Running tests..." + @go test ./... + @echo "✅ All checks passed!" + # Clean build artifacts clean: rm -rf bin/ diff --git a/cmd/markdocify/main.go b/cmd/markdocify/main.go index 8e69b3c..fc2b459 100644 --- a/cmd/markdocify/main.go +++ b/cmd/markdocify/main.go @@ -107,15 +107,15 @@ func createQuickConfig(inputURL string) (*config.Config, error) { // Create comprehensive domain patterns for following links domain := parsedURL.Hostname() basePattern := strings.ReplaceAll(domain, ".", "\\.") - + // Much more aggressive following patterns for comprehensive documentation coverage followPatterns := []string{ fmt.Sprintf("^https?://%s/.*", basePattern), // Main domain pattern } - + // Add specific documentation path patterns based on the starting URL if strings.Contains(inputURL, "/docs") { - followPatterns = append(followPatterns, + followPatterns = append(followPatterns, fmt.Sprintf("^https?://%s/docs/.*", basePattern), fmt.Sprintf("^https?://%s/documentation/.*", basePattern), fmt.Sprintf("^https?://%s/guide/.*", basePattern), @@ -135,7 +135,7 @@ func createQuickConfig(inputURL string) (*config.Config, error) { BaseURL: fmt.Sprintf("%s://%s", parsedURL.Scheme, parsedURL.Host), OutputFile: outputFile, StartURLs: []string{inputURL}, - + FollowPatterns: followPatterns, IgnorePatterns: []string{ // Media files @@ -178,7 +178,7 @@ func createQuickConfig(inputURL string) (*config.Config, error) { }, Selectors: config.SelectorConfig{ - Title: "h1, title, .page-title, .doc-title, [data-testid='page-title']", + Title: "h1, title, .page-title, .doc-title, [data-testid='page-title']", Content: strings.Join([]string{ // Primary content containers "main", "article", ".content", ".documentation", ".docs", "#content", ".main-content", @@ -288,4 +288,4 @@ func main() { fmt.Fprintf(os.Stderr, "Error: %v\n", err) os.Exit(1) } -} \ No newline at end of file +} diff --git a/cmd/markdocify/main_test.go b/cmd/markdocify/main_test.go index 49cdc53..50c1ddf 100644 --- a/cmd/markdocify/main_test.go +++ b/cmd/markdocify/main_test.go @@ -17,7 +17,7 @@ func TestVersion(t *testing.T) { err := rootCmd.Execute() require.NoError(t, err) }) - + assert.Contains(t, output, "markdocify version") } @@ -28,7 +28,7 @@ func TestHelp(t *testing.T) { err := rootCmd.Execute() require.NoError(t, err) }) - + assert.Contains(t, output, "markdocify is a CLI tool") assert.Contains(t, output, "Usage:") } @@ -61,44 +61,44 @@ func TestRunScraperErrors(t *testing.T) { assert.Error(t, err) assert.Contains(t, err.Error(), "either provide a URL") }) - + t.Run("invalid URL", func(t *testing.T) { err := runScraper(rootCmd, []string{"not-a-valid-url"}) assert.Error(t, err) assert.Contains(t, err.Error(), "invalid") }) - + t.Run("invalid config file", func(t *testing.T) { // Save original value origConfigFile := configFile defer func() { configFile = origConfigFile }() - + // Set non-existent config file configFile = "/tmp/non-existent-config.yml" - + err := runScraper(rootCmd, []string{}) assert.Error(t, err) assert.Contains(t, err.Error(), "failed to load config") }) - + t.Run("malformed config file", func(t *testing.T) { // Create temporary malformed config file tmpFile, err := os.CreateTemp("", "bad-config-*.yml") require.NoError(t, err) defer os.Remove(tmpFile.Name()) - + // Write invalid YAML _, err = tmpFile.WriteString("invalid: yaml: content: [\n") require.NoError(t, err) tmpFile.Close() - + // Save original value origConfigFile := configFile defer func() { configFile = origConfigFile }() - + // Set malformed config file configFile = tmpFile.Name() - + err = runScraper(rootCmd, []string{}) assert.Error(t, err) assert.Contains(t, err.Error(), "failed to load config") @@ -110,7 +110,7 @@ func TestRunScraperWithConfig(t *testing.T) { tmpFile, err := os.CreateTemp("", "test-config-*.yml") require.NoError(t, err) defer os.Remove(tmpFile.Name()) - + configContent := ` name: "Test Documentation" base_url: "https://example.com" @@ -124,14 +124,14 @@ processing: _, err = tmpFile.WriteString(configContent) require.NoError(t, err) tmpFile.Close() - + // Save original value origConfigFile := configFile defer func() { configFile = origConfigFile }() - + // Set config file flag configFile = tmpFile.Name() - + // Test with config file (this will fail during scraping since example.com/docs doesn't exist) err = runScraper(rootCmd, []string{}) assert.Error(t, err) // Expected since we're scraping a non-existent URL @@ -143,10 +143,10 @@ func TestRunScraperWithOutputFlag(t *testing.T) { // Save original values origOutputFile := outputFile defer func() { outputFile = origOutputFile }() - + // Test URL with output flag outputFile = "/tmp/test-output.md" - + // This should succeed as httpbin.org/html is actually accessible err := runScraper(rootCmd, []string{"https://httpbin.org/html"}) // httpbin.org/html usually works, so we don't expect an error @@ -166,17 +166,17 @@ func TestCreateQuickConfig(t *testing.T) { maxDepth = origMaxDepth concurrency = origConcurrency }() - + // Set test values outputFile = "" maxDepth = 5 concurrency = 2 - + testURL := "https://example.com/docs" - + cfg, err := createQuickConfig(testURL) require.NoError(t, err) - + assert.Equal(t, "Example.com Documentation", cfg.Name) assert.Equal(t, "https://example.com", cfg.BaseURL) assert.Equal(t, "example-com-docs.md", cfg.OutputFile) @@ -186,22 +186,22 @@ func TestCreateQuickConfig(t *testing.T) { assert.True(t, cfg.Processing.PreserveCodeBlocks) assert.True(t, cfg.Processing.GenerateTOC) assert.True(t, cfg.Processing.SanitizeHTML) - + // Test invalid URL _, err = createQuickConfig("not-a-valid-url") assert.Error(t, err) assert.Contains(t, err.Error(), "invalid") - + // Test empty URL _, err = createQuickConfig("") assert.Error(t, err) assert.Contains(t, err.Error(), "invalid") - + // Test URL with different schemes cfg, err = createQuickConfig("http://example.com/docs") require.NoError(t, err) assert.Equal(t, "http://example.com", cfg.BaseURL) - + // Test URL with custom output file outputFile = "custom-output.md" cfg, err = createQuickConfig("https://test.com/api") @@ -218,7 +218,7 @@ func TestMain(t *testing.T) { {"version command", []string{"markdocify", "--version"}}, {"invalid command", []string{"markdocify", "invalid-url-without-scheme"}}, } - + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { // Test that main doesn't panic @@ -227,29 +227,29 @@ func TestMain(t *testing.T) { t.Errorf("main() panicked with args %v: %v", tt.args, r) } }() - + // Save original args and restore them after test oldArgs := os.Args defer func() { os.Args = oldArgs }() - + // Set test args os.Args = tt.args - + // Capture output to avoid printing to test output old := os.Stdout oldStderr := os.Stderr r, w, _ := os.Pipe() os.Stdout = w os.Stderr = w - + // This should not panic main() - + // Restore stdout/stderr w.Close() os.Stdout = old os.Stderr = oldStderr - + // Read the output (optional, just to consume it) _, _ = io.ReadAll(r) }) @@ -261,17 +261,17 @@ func captureOutput(t *testing.T, fn func()) string { old := os.Stdout r, w, err := os.Pipe() require.NoError(t, err) - + os.Stdout = w - + fn() - + w.Close() os.Stdout = old - + var buf bytes.Buffer _, err = io.Copy(&buf, r) require.NoError(t, err) - + return buf.String() -} \ No newline at end of file +} diff --git a/internal/aggregator/aggregator.go b/internal/aggregator/aggregator.go index a5461ea..613b20e 100644 --- a/internal/aggregator/aggregator.go +++ b/internal/aggregator/aggregator.go @@ -13,8 +13,10 @@ import ( "github.com/vladkampov/markdocify/internal/config" ) +// MaxPagesInMemory defines the threshold for warning about memory usage. const MaxPagesInMemory = 1000 +// Aggregator collects and organizes scraped pages into a single output document. type Aggregator struct { config *config.Config pages []*Page @@ -61,7 +63,7 @@ func (a *Aggregator) AddPage(url, title, content string, depth int) { Depth: depth, Timestamp: time.Now(), } - + a.pages = append(a.pages, page) a.contentHashes[contentHash] = true } @@ -114,13 +116,13 @@ func (a *Aggregator) writeMetadata(output *strings.Builder) { func (a *Aggregator) writeTableOfContents(output *strings.Builder) { output.WriteString("## Table of Contents\n\n") - + for _, page := range a.pages { indent := strings.Repeat(" ", page.Depth) anchor := a.createAnchor(page.Title) output.WriteString(fmt.Sprintf("%s- [%s](#%s)\n", indent, page.Title, anchor)) } - + output.WriteString("\n---\n\n") } @@ -155,13 +157,13 @@ func (a *Aggregator) createAnchor(title string) string { anchor = strings.ReplaceAll(anchor, "<", "") anchor = strings.ReplaceAll(anchor, ">", "") anchor = strings.ReplaceAll(anchor, ",", "") - + for strings.Contains(anchor, "--") { anchor = strings.ReplaceAll(anchor, "--", "-") } - + anchor = strings.Trim(anchor, "-") - + return anchor } @@ -180,7 +182,7 @@ func (a *Aggregator) writeContent(output *strings.Builder) { if headingLevel > 6 { headingLevel = 6 } - + headingPrefix := strings.Repeat("#", headingLevel) output.WriteString(fmt.Sprintf("%s %s\n\n", headingPrefix, pageTitle)) @@ -203,7 +205,7 @@ func (a *Aggregator) extractTitleFromURL(url string) string { if lastPart == "" && len(parts) > 1 { lastPart = parts[len(parts)-2] } - + if lastPart != "" { title := strings.ReplaceAll(lastPart, "-", " ") title = strings.ReplaceAll(title, "_", " ") @@ -211,7 +213,7 @@ func (a *Aggregator) extractTitleFromURL(url string) string { return title } } - + return "Untitled" } @@ -220,7 +222,11 @@ func (a *Aggregator) writeToFile(content string) error { if err != nil { return fmt.Errorf("failed to create output file: %w", err) } - defer file.Close() + defer func() { + if closeErr := file.Close(); closeErr != nil { + err = fmt.Errorf("failed to close file: %w", closeErr) + } + }() _, err = file.WriteString(content) if err != nil { @@ -238,4 +244,4 @@ func titleCase(s string) string { } } return strings.Join(words, " ") -} \ No newline at end of file +} diff --git a/internal/aggregator/aggregator_test.go b/internal/aggregator/aggregator_test.go index 758924c..8e4a5c2 100644 --- a/internal/aggregator/aggregator_test.go +++ b/internal/aggregator/aggregator_test.go @@ -70,15 +70,15 @@ func TestGenerateOutput(t *testing.T) { require.NoError(t, err) contentStr := string(content) - + // Verify metadata assert.Contains(t, contentStr, "# Test Documentation") assert.Contains(t, contentStr, "**Total Pages**: 2") assert.Contains(t, contentStr, "**Max Depth**: 2") - + // Verify TOC assert.Contains(t, contentStr, "## Table of Contents") - + // Verify content assert.Contains(t, contentStr, "# Home") assert.Contains(t, contentStr, "## API") @@ -140,7 +140,7 @@ func TestSortPages(t *testing.T) { assert.Equal(t, 0, agg.pages[1].Depth) assert.Equal(t, 1, agg.pages[2].Depth) assert.Equal(t, 1, agg.pages[3].Depth) - + // Within same depth, should be sorted by URL assert.True(t, agg.pages[0].URL < agg.pages[1].URL) assert.True(t, agg.pages[2].URL < agg.pages[3].URL) @@ -199,4 +199,3 @@ func TestMemoryLimitWarning(t *testing.T) { assert.Equal(t, 5, agg.GetPageCount()) } - diff --git a/internal/config/config.go b/internal/config/config.go index 1dc0626..dda8a10 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -60,20 +60,20 @@ type OutputConfig struct { } type SecurityConfig struct { - RespectRobots bool `yaml:"respect_robots"` - CheckTerms bool `yaml:"check_terms"` - MaxFileSize string `yaml:"max_file_size"` - AllowedDomains []string `yaml:"allowed_domains"` - RequestTimeout time.Duration `yaml:"request_timeout"` - ScrapingTimeout time.Duration `yaml:"scraping_timeout"` - MaxFileSizeBytes int64 + RespectRobots bool `yaml:"respect_robots"` + CheckTerms bool `yaml:"check_terms"` + MaxFileSize string `yaml:"max_file_size"` + AllowedDomains []string `yaml:"allowed_domains"` + RequestTimeout time.Duration `yaml:"request_timeout"` + ScrapingTimeout time.Duration `yaml:"scraping_timeout"` + MaxFileSizeBytes int64 } type MonitoringConfig struct { - EnableMetrics bool `yaml:"enable_metrics"` - LogLevel string `yaml:"log_level"` - ProgressUpdates bool `yaml:"progress_updates"` - MetricsPort int `yaml:"metrics_port"` + EnableMetrics bool `yaml:"enable_metrics"` + LogLevel string `yaml:"log_level"` + ProgressUpdates bool `yaml:"progress_updates"` + MetricsPort int `yaml:"metrics_port"` } func LoadConfig(path string) (*Config, error) { @@ -130,7 +130,7 @@ func (c *Config) SetDefaults() error { } c.Security.MaxFileSizeBytes = maxSize c.Security.RequestTimeout = 30 * time.Second - + // Set default scraping timeout - generous for large documentation sites if c.Security.ScrapingTimeout == 0 { c.Security.ScrapingTimeout = 10 * time.Minute @@ -153,15 +153,15 @@ func (c *Config) Validate() error { if c.Name == "" { return fmt.Errorf("name is required") } - + if err := validateURL(c.BaseURL, "base_url"); err != nil { return err } - + if c.OutputFile == "" { return fmt.Errorf("output_file is required") } - + if len(c.StartURLs) == 0 { return fmt.Errorf("start_urls is required and must contain at least one URL") } @@ -266,4 +266,4 @@ func parseSize(sizeStr string) (int64, error) { } return num * multiplier, nil -} \ No newline at end of file +} diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 48c188d..bf0e81e 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -313,7 +313,7 @@ func TestSetDefaults_EdgeCases(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { err := tt.config.SetDefaults() - + if tt.expectedError != "" { require.Error(t, err) assert.Contains(t, err.Error(), tt.expectedError) @@ -381,7 +381,7 @@ func TestValidateURL_EdgeCases(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { err := validateURL(tt.url, tt.fieldName) - + if tt.expectError != "" { require.Error(t, err) assert.Contains(t, err.Error(), tt.expectError) @@ -471,4 +471,4 @@ func TestValidate_ComprehensiveCases(t *testing.T) { assert.Contains(t, err.Error(), tt.expectError) }) } -} \ No newline at end of file +} diff --git a/internal/converter/converter.go b/internal/converter/converter.go index b957f65..05c713f 100644 --- a/internal/converter/converter.go +++ b/internal/converter/converter.go @@ -1,3 +1,4 @@ +// Package converter provides HTML to Markdown conversion functionality. package converter import ( @@ -11,13 +12,14 @@ import ( "github.com/vladkampov/markdocify/internal/types" ) +// Converter handles the conversion of HTML content to Markdown format. type Converter struct { - config *config.Config - sanitizer *bluemonday.Policy + config *config.Config + sanitizer *bluemonday.Policy mdConverter *md.Converter } - +// New creates a new Converter instance with the provided configuration. func New(cfg *config.Config) (*Converter, error) { c := &Converter{ config: cfg, @@ -31,20 +33,20 @@ func New(cfg *config.Config) (*Converter, error) { func (c *Converter) createSanitizer() *bluemonday.Policy { p := bluemonday.UGCPolicy() - + p.AllowElements("pre", "code", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6") p.AllowElements("table", "thead", "tbody", "tr", "th", "td") p.AllowElements("ul", "ol", "li", "dl", "dt", "dd") p.AllowElements("p", "br", "hr", "div", "span") p.AllowElements("strong", "b", "em", "i", "u", "s", "del", "ins") p.AllowElements("a").AllowAttrs("href", "title").OnElements("a") - + if c.config.Output.PreserveImages { p.AllowElements("img").AllowAttrs("src", "alt", "title", "width", "height").OnElements("img") } p.AllowAttrs("class").OnElements("pre", "code") - + if !c.config.Output.InlineStyles { p.AllowAttrs("style").OnElements("*") } @@ -54,12 +56,13 @@ func (c *Converter) createSanitizer() *bluemonday.Policy { func (c *Converter) createMarkdownConverter() *md.Converter { converter := md.NewConverter("", true, nil) - + converter.Use(plugin.GitHubFlavored()) - + return converter } +// ConvertToMarkdown converts the HTML content of a page to Markdown format. func (c *Converter) ConvertToMarkdown(page *types.PageContent) (string, error) { if page.Content == "" { return "", fmt.Errorf("no content to convert") @@ -92,7 +95,7 @@ func (c *Converter) postProcessMarkdown(markdown string) string { for _, line := range lines { line = strings.TrimRight(line, " \t") - + if strings.TrimSpace(line) == "" { if len(processedLines) == 0 || processedLines[len(processedLines)-1] != "" { processedLines = append(processedLines, "") @@ -111,10 +114,10 @@ func (c *Converter) postProcessMarkdown(markdown string) string { func (c *Converter) generateMetadata(page *types.PageContent) string { var metadata []string - + metadata = append(metadata, fmt.Sprintf("", page.URL)) metadata = append(metadata, fmt.Sprintf("", page.Title)) metadata = append(metadata, fmt.Sprintf("", page.Depth)) - + return strings.Join(metadata, "\n") -} \ No newline at end of file +} diff --git a/internal/converter/converter_test.go b/internal/converter/converter_test.go index 24742ba..5e5b61f 100644 --- a/internal/converter/converter_test.go +++ b/internal/converter/converter_test.go @@ -44,7 +44,7 @@ func TestCreateSanitizer(t *testing.T) { InlineStyles: false, }, }, - inputHTML: `

Safe content

`, + inputHTML: `

Safe content

`, expectContains: []string{"Safe content"}, expectRemoved: []string{"script", "alert"}, }, @@ -56,7 +56,7 @@ func TestCreateSanitizer(t *testing.T) { InlineStyles: false, }, }, - inputHTML: `test

Content

`, + inputHTML: `test

Content

`, expectContains: []string{"img", "src=\"test.jpg\"", "alt=\"test\""}, }, { @@ -67,7 +67,7 @@ func TestCreateSanitizer(t *testing.T) { InlineStyles: false, }, }, - inputHTML: `
func main() {}
`, + inputHTML: `
func main() {}
`, expectContains: []string{"
", "Title

Paragraph

") require.NoError(t, err) @@ -351,4 +351,4 @@ func TestSanitizer_ComplexHTML(t *testing.T) { // Should remove dangerous elements assert.NotContains(t, result, "