diff --git a/Gemfile b/Gemfile index bf748f0183..8b25414561 100644 --- a/Gemfile +++ b/Gemfile @@ -167,6 +167,9 @@ gem 'aws-sdk-s3', require: false gem 'azure-storage', require: false gem 'google-cloud-storage', '~> 1.45', require: false +# Storage content analyzers +gem 'excel_analyzer', path: 'gems/excel_analyzer' + group :test do gem 'fivemat', '~> 1.3.7' gem 'webmock', '~> 3.19.1' diff --git a/Gemfile.lock b/Gemfile.lock index 710e002303..bbc3907d13 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -51,6 +51,14 @@ PATH mime-types (< 4.0.0) rails (~> 7.0.4) +PATH + remote: gems/excel_analyzer + specs: + excel_analyzer (0.0.1) + activestorage + nokogiri + rubyzip + GEM remote: https://rubygems.org/ specs: @@ -575,6 +583,7 @@ DEPENDENCIES capybara (~> 3.39.2) charlock_holmes (~> 0.7.7) dalli (~> 3.2.6) + excel_analyzer! exception_notification (~> 4.5.0) factory_bot_rails (~> 6.4.2) fancybox-rails (~> 0.3.0) diff --git a/doc/CHANGES.md b/doc/CHANGES.md index e67fc5ac5a..8198d14c80 100644 --- a/doc/CHANGES.md +++ b/doc/CHANGES.md @@ -2,6 +2,8 @@ ## Highlighted Features +* Add XSLX spreadsheet analyser to automatically detect hidden data (Helen + Cross, Graeme Porteous) * Update attachment processing to automatically rebuild if cached file goes missing (Graeme Porteous) * Allow `InfoRequest` to be categorised (Graeme Porteous) diff --git a/gems/excel_analyzer/.gitignore b/gems/excel_analyzer/.gitignore new file mode 100644 index 0000000000..a85f3fae79 --- /dev/null +++ b/gems/excel_analyzer/.gitignore @@ -0,0 +1 @@ +.rspec_status diff --git a/gems/excel_analyzer/Gemfile b/gems/excel_analyzer/Gemfile new file mode 100644 index 0000000000..68e9bd0ef0 --- /dev/null +++ b/gems/excel_analyzer/Gemfile @@ -0,0 +1,6 @@ +# frozen_string_literal: true + +source "https://rubygems.org" + +# Specify your gem's dependencies in excel_analyzer.gemspec +gemspec diff --git a/gems/excel_analyzer/README.md b/gems/excel_analyzer/README.md new file mode 100644 index 0000000000..f321fba0a1 --- /dev/null +++ b/gems/excel_analyzer/README.md @@ -0,0 +1,16 @@ +# ExcelAnalyzer + +This gem packages an analyzer to inspects `XLSX` files (uploaded and analyzed as +`ActiveStorage::Blob` objects) for hidden data, adding the results to the blob +metadata. + +## Development + +After checking out the repo, run `bin/setup` to install dependencies. Then, run +`rake spec` to run the tests. You can also run `bin/console` for an interactive +prompt that will allow you to experiment. + +## Contributing + +Bug reports and pull requests are welcome on GitHub at +https://github.com/mysociety/alaveteli diff --git a/gems/excel_analyzer/Rakefile b/gems/excel_analyzer/Rakefile new file mode 100644 index 0000000000..b6ae734104 --- /dev/null +++ b/gems/excel_analyzer/Rakefile @@ -0,0 +1,8 @@ +# frozen_string_literal: true + +require "bundler/gem_tasks" +require "rspec/core/rake_task" + +RSpec::Core::RakeTask.new(:spec) + +task default: :spec diff --git a/gems/excel_analyzer/bin/console b/gems/excel_analyzer/bin/console new file mode 100755 index 0000000000..d8b31c8448 --- /dev/null +++ b/gems/excel_analyzer/bin/console @@ -0,0 +1,15 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require "bundler/setup" +require "excel_analyzer" + +# You can add fixtures and/or initialization code here to make experimenting +# with your gem easier. You can also use a different console, if you like. + +# (If you use this, don't forget to add pry to your Gemfile!) +# require "pry" +# Pry.start + +require "irb" +IRB.start(__FILE__) diff --git a/gems/excel_analyzer/bin/setup b/gems/excel_analyzer/bin/setup new file mode 100755 index 0000000000..dce67d860a --- /dev/null +++ b/gems/excel_analyzer/bin/setup @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +set -euo pipefail +IFS=$'\n\t' +set -vx + +bundle install + +# Do any other automated setup that you need to do here diff --git a/gems/excel_analyzer/excel_analyzer.gemspec b/gems/excel_analyzer/excel_analyzer.gemspec new file mode 100644 index 0000000000..3ac499a707 --- /dev/null +++ b/gems/excel_analyzer/excel_analyzer.gemspec @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +lib = File.expand_path('lib', __dir__) +$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) + +Gem::Specification.new do |spec| + spec.name = "excel_analyzer" + spec.version = "0.0.1" + spec.authors = ["mySociety"] + spec.email = ["alaveteli@mysociety.org"] + + spec.summary = "File analysers for ActiveStorage" + spec.description = "Extra ActiveStorage Analysers for Alaveteli" + spec.homepage = "https://alaveteli.org" + spec.required_ruby_version = ">= 3.0.0" + + spec.metadata["homepage_uri"] = spec.homepage + + spec.files = Dir.chdir(__dir__) do + `git ls-files -z`.split("\x0").reject do |f| + (File.expand_path(f) == __FILE__) || f.start_with?(*%w[bin/ spec/ .git]) + end + end + spec.require_paths = ["lib"] + + spec.add_dependency "activestorage" + spec.add_dependency "nokogiri" + spec.add_dependency "rubyzip" + + spec.add_development_dependency "bundler" + spec.add_development_dependency "pry" + spec.add_development_dependency "rake" + spec.add_development_dependency "rspec" +end diff --git a/gems/excel_analyzer/lib/excel_analyzer.rb b/gems/excel_analyzer/lib/excel_analyzer.rb new file mode 100644 index 0000000000..92cb43e5cb --- /dev/null +++ b/gems/excel_analyzer/lib/excel_analyzer.rb @@ -0,0 +1,2 @@ +require "excel_analyzer/analyzer" +require "excel_analyzer/railtie" if defined?(Rails) diff --git a/gems/excel_analyzer/lib/excel_analyzer/analyzer.rb b/gems/excel_analyzer/lib/excel_analyzer/analyzer.rb new file mode 100644 index 0000000000..3ef3d11669 --- /dev/null +++ b/gems/excel_analyzer/lib/excel_analyzer/analyzer.rb @@ -0,0 +1,80 @@ +require "active_storage" +require "active_storage/analyzer" +require "nokogiri" +require "zip" + +module ExcelAnalyzer + ## + # The Analyzer class is responsible for analyzing Excel files uploaded through + # Active Storage. It checks for various features within the Excel file such as + # hidden rows, columns, sheets, pivot caches, and external links. + # + # The class uses rubyzip and Nokogiri for reading and parsing the contents of + # the Excel (.xlsx) files. + # + class Analyzer < ActiveStorage::Analyzer + XLSX_CONTENT_TYPE = + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + + def self.accept?(blob) + blob.content_type == XLSX_CONTENT_TYPE + end + + def metadata + { excel: excel_metadata } + end + + private + + def excel_metadata + download_blob_to_tempfile(&method(:probe)) + end + + def probe(tempfile) + Zip::File.open(tempfile.path) do |zip_file| + { + pivot_cache: zip_file.glob("xl/pivotCache/*").any?, + external_links: zip_file.glob("xl/externalLinks/*").any?, + hidden_rows: hidden_rows?(zip_file), + hidden_columns: hidden_columns?(zip_file), + hidden_sheets: hidden_sheets?(zip_file) + } + end + + rescue StandardError => ex + { error: ex.message } + end + + def namespace + { "ns" => "http://schemas.openxmlformats.org/spreadsheetml/2006/main" } + end + + def hidden?(object) + object.attr("hidden") == "true" || + object.attr("hidden") == "1" || + object.attr("state") == "hidden" + end + + def hidden_rows?(zip_file) + zip_file.glob("xl/worksheets/*.xml").any? do |worksheet_file| + doc = Nokogiri::XML(worksheet_file.get_input_stream.read) + doc.xpath("//ns:row", namespace).any?(&method(:hidden?)) + end + end + + def hidden_columns?(zip_file) + zip_file.glob("xl/worksheets/*.xml").any? do |worksheet_file| + doc = Nokogiri::XML(worksheet_file.get_input_stream.read) + doc.xpath("//ns:col", namespace).any?(&method(:hidden?)) + end + end + + def hidden_sheets?(zip_file) + workbook_file = zip_file.glob("xl/workbook.xml").first + return false unless workbook_file + + doc = Nokogiri::XML(workbook_file.get_input_stream.read) + doc.xpath("//ns:sheet", namespace).any?(&method(:hidden?)) + end + end +end diff --git a/gems/excel_analyzer/lib/excel_analyzer/railtie.rb b/gems/excel_analyzer/lib/excel_analyzer/railtie.rb new file mode 100644 index 0000000000..6c92d905d9 --- /dev/null +++ b/gems/excel_analyzer/lib/excel_analyzer/railtie.rb @@ -0,0 +1,12 @@ +require "rails" +require "active_storage" + +module ExcelAnalyzer + ## + # This Railtie integrates the gem with Rails by extending ActiveStorage's + # Analyzers with the custom ExcelAnalyzer::Analyzer. + # + class Railtie < Rails::Railtie + config.active_storage.analyzers.prepend ExcelAnalyzer::Analyzer + end +end diff --git a/gems/excel_analyzer/spec/excel_analyzer/analyzer_spec.rb b/gems/excel_analyzer/spec/excel_analyzer/analyzer_spec.rb new file mode 100644 index 0000000000..bdce8360b9 --- /dev/null +++ b/gems/excel_analyzer/spec/excel_analyzer/analyzer_spec.rb @@ -0,0 +1,91 @@ +# frozen_string_literal: true + +require "spec_helper" + +RSpec.describe ExcelAnalyzer::Analyzer do + describe ".accept?" do + subject { ExcelAnalyzer::Analyzer.accept?(blob) } + + context "when the blob is an Excel file" do + let(:blob) do + fake_blob(content_type: ExcelAnalyzer::Analyzer::XLSX_CONTENT_TYPE) + end + + it { is_expected.to eq true } + end + + context "when the blob is not an Excel file" do + let(:blob) { fake_blob(content_type: "text/plain") } + it { is_expected.to eq false } + end + end + + describe "#metadata" do + let(:metadata) { ExcelAnalyzer::Analyzer.new(blob).metadata } + + context "when the blob is an Excel file with hidden data" do + let(:blob) do + fake_blob(io: File.open(File.join(__dir__, "../fixtures/suspect.xlsx")), + content_type: ExcelAnalyzer::Analyzer::XLSX_CONTENT_TYPE) + end + + it "detects pivot cache" do + expect(metadata[:excel][:pivot_cache]).to eq true + end + + it "detects external links" do + expect(metadata[:excel][:external_links]).to eq true + end + + it "detects hidden rows" do + expect(metadata[:excel][:hidden_rows]).to eq true + end + + it "detects hidden columns" do + expect(metadata[:excel][:hidden_columns]).to eq true + end + + it "detects hidden sheets" do + expect(metadata[:excel][:hidden_sheets]).to eq true + end + end + + context "when the blob is an Excel file without hidden data" do + let(:blob) do + fake_blob(io: File.open(File.join(__dir__, "../fixtures/data.xlsx")), + content_type: ExcelAnalyzer::Analyzer::XLSX_CONTENT_TYPE) + end + + it "does not detect hidden data" do + expect(metadata[:excel]).to eq( + pivot_cache: false, + external_links: false, + hidden_rows: false, + hidden_columns: false, + hidden_sheets: false + ) + end + end + + context "when the blob is not an Excel file" do + let(:blob) do + fake_blob(io: File.open(File.join(__dir__, "../fixtures/plain.txt")), + content_type: "text/plain") + end + + it "returns an error metadata" do + expect(metadata[:excel]).to eq( + error: "Zip end of central directory signature not found" + ) + end + end + end + + private + + def fake_blob(io: nil, content_type:) + dbl = double(content_type: content_type) + allow(dbl).to receive(:open).and_yield(io) + dbl + end +end diff --git a/gems/excel_analyzer/spec/fixtures/data.xlsx b/gems/excel_analyzer/spec/fixtures/data.xlsx new file mode 100644 index 0000000000..132ff364f8 Binary files /dev/null and b/gems/excel_analyzer/spec/fixtures/data.xlsx differ diff --git a/gems/excel_analyzer/spec/fixtures/plain.txt b/gems/excel_analyzer/spec/fixtures/plain.txt new file mode 100644 index 0000000000..a496efee84 --- /dev/null +++ b/gems/excel_analyzer/spec/fixtures/plain.txt @@ -0,0 +1 @@ +This is a text file diff --git a/gems/excel_analyzer/spec/fixtures/suspect.xlsx b/gems/excel_analyzer/spec/fixtures/suspect.xlsx new file mode 100644 index 0000000000..d2e01fcc30 Binary files /dev/null and b/gems/excel_analyzer/spec/fixtures/suspect.xlsx differ diff --git a/gems/excel_analyzer/spec/spec_helper.rb b/gems/excel_analyzer/spec/spec_helper.rb new file mode 100644 index 0000000000..5fe1b720c1 --- /dev/null +++ b/gems/excel_analyzer/spec/spec_helper.rb @@ -0,0 +1,16 @@ +# frozen_string_literal: true + +require "bundler/setup" +require "excel_analyzer/analyzer" + +RSpec.configure do |config| + # Enable flags like --only-failures and --next-failure + config.example_status_persistence_file_path = ".rspec_status" + + # Disable RSpec exposing methods globally on `Module` and `main` + config.disable_monkey_patching! + + config.expect_with :rspec do |c| + c.syntax = :expect + end +end