Skip to content

Commit

Permalink
Merge branch 'excel-analyzer' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
gbp committed Dec 13, 2023
2 parents 80ecdc5 + 79b1e62 commit 17a3fcd
Show file tree
Hide file tree
Showing 18 changed files with 304 additions and 0 deletions.
3 changes: 3 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,9 @@ gem 'aws-sdk-s3', require: false
gem 'azure-storage', require: false
gem 'google-cloud-storage', '~> 1.45', require: false

# Storage content analyzers
gem 'excel_analyzer', path: 'gems/excel_analyzer'

group :test do
gem 'fivemat', '~> 1.3.7'
gem 'webmock', '~> 3.19.1'
Expand Down
9 changes: 9 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,14 @@ PATH
mime-types (< 4.0.0)
rails (~> 7.0.4)

PATH
remote: gems/excel_analyzer
specs:
excel_analyzer (0.0.1)
activestorage
nokogiri
rubyzip

GEM
remote: https://rubygems.org/
specs:
Expand Down Expand Up @@ -575,6 +583,7 @@ DEPENDENCIES
capybara (~> 3.39.2)
charlock_holmes (~> 0.7.7)
dalli (~> 3.2.6)
excel_analyzer!
exception_notification (~> 4.5.0)
factory_bot_rails (~> 6.4.2)
fancybox-rails (~> 0.3.0)
Expand Down
2 changes: 2 additions & 0 deletions doc/CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

## Highlighted Features

* Add XSLX spreadsheet analyser to automatically detect hidden data (Helen
Cross, Graeme Porteous)
* Update attachment processing to automatically rebuild if cached file goes
missing (Graeme Porteous)
* Allow `InfoRequest` to be categorised (Graeme Porteous)
Expand Down
1 change: 1 addition & 0 deletions gems/excel_analyzer/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.rspec_status
6 changes: 6 additions & 0 deletions gems/excel_analyzer/Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# frozen_string_literal: true

source "https://rubygems.org"

# Specify your gem's dependencies in excel_analyzer.gemspec
gemspec
16 changes: 16 additions & 0 deletions gems/excel_analyzer/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# ExcelAnalyzer

This gem packages an analyzer to inspects `XLSX` files (uploaded and analyzed as
`ActiveStorage::Blob` objects) for hidden data, adding the results to the blob
metadata.

## Development

After checking out the repo, run `bin/setup` to install dependencies. Then, run
`rake spec` to run the tests. You can also run `bin/console` for an interactive
prompt that will allow you to experiment.

## Contributing

Bug reports and pull requests are welcome on GitHub at
https://github.com/mysociety/alaveteli
8 changes: 8 additions & 0 deletions gems/excel_analyzer/Rakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# frozen_string_literal: true

require "bundler/gem_tasks"
require "rspec/core/rake_task"

RSpec::Core::RakeTask.new(:spec)

task default: :spec
15 changes: 15 additions & 0 deletions gems/excel_analyzer/bin/console
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

require "bundler/setup"
require "excel_analyzer"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require "irb"
IRB.start(__FILE__)
8 changes: 8 additions & 0 deletions gems/excel_analyzer/bin/setup
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
set -euo pipefail
IFS=$'\n\t'
set -vx

bundle install

# Do any other automated setup that you need to do here
34 changes: 34 additions & 0 deletions gems/excel_analyzer/excel_analyzer.gemspec
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# frozen_string_literal: true

lib = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

Gem::Specification.new do |spec|
spec.name = "excel_analyzer"
spec.version = "0.0.1"
spec.authors = ["mySociety"]
spec.email = ["alaveteli@mysociety.org"]

spec.summary = "File analysers for ActiveStorage"
spec.description = "Extra ActiveStorage Analysers for Alaveteli"
spec.homepage = "https://alaveteli.org"
spec.required_ruby_version = ">= 3.0.0"

spec.metadata["homepage_uri"] = spec.homepage

spec.files = Dir.chdir(__dir__) do
`git ls-files -z`.split("\x0").reject do |f|
(File.expand_path(f) == __FILE__) || f.start_with?(*%w[bin/ spec/ .git])
end
end
spec.require_paths = ["lib"]

spec.add_dependency "activestorage"
spec.add_dependency "nokogiri"
spec.add_dependency "rubyzip"

spec.add_development_dependency "bundler"
spec.add_development_dependency "pry"
spec.add_development_dependency "rake"
spec.add_development_dependency "rspec"
end
2 changes: 2 additions & 0 deletions gems/excel_analyzer/lib/excel_analyzer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
require "excel_analyzer/analyzer"
require "excel_analyzer/railtie" if defined?(Rails)
80 changes: 80 additions & 0 deletions gems/excel_analyzer/lib/excel_analyzer/analyzer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
require "active_storage"
require "active_storage/analyzer"
require "nokogiri"
require "zip"

module ExcelAnalyzer
##
# The Analyzer class is responsible for analyzing Excel files uploaded through
# Active Storage. It checks for various features within the Excel file such as
# hidden rows, columns, sheets, pivot caches, and external links.
#
# The class uses rubyzip and Nokogiri for reading and parsing the contents of
# the Excel (.xlsx) files.
#
class Analyzer < ActiveStorage::Analyzer
XLSX_CONTENT_TYPE =
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

def self.accept?(blob)
blob.content_type == XLSX_CONTENT_TYPE
end

def metadata
{ excel: excel_metadata }
end

private

def excel_metadata
download_blob_to_tempfile(&method(:probe))
end

def probe(tempfile)
Zip::File.open(tempfile.path) do |zip_file|
{
pivot_cache: zip_file.glob("xl/pivotCache/*").any?,
external_links: zip_file.glob("xl/externalLinks/*").any?,
hidden_rows: hidden_rows?(zip_file),
hidden_columns: hidden_columns?(zip_file),
hidden_sheets: hidden_sheets?(zip_file)
}
end

rescue StandardError => ex
{ error: ex.message }
end

def namespace
{ "ns" => "http://schemas.openxmlformats.org/spreadsheetml/2006/main" }
end

def hidden?(object)
object.attr("hidden") == "true" ||
object.attr("hidden") == "1" ||
object.attr("state") == "hidden"
end

def hidden_rows?(zip_file)
zip_file.glob("xl/worksheets/*.xml").any? do |worksheet_file|
doc = Nokogiri::XML(worksheet_file.get_input_stream.read)
doc.xpath("//ns:row", namespace).any?(&method(:hidden?))
end
end

def hidden_columns?(zip_file)
zip_file.glob("xl/worksheets/*.xml").any? do |worksheet_file|
doc = Nokogiri::XML(worksheet_file.get_input_stream.read)
doc.xpath("//ns:col", namespace).any?(&method(:hidden?))
end
end

def hidden_sheets?(zip_file)
workbook_file = zip_file.glob("xl/workbook.xml").first
return false unless workbook_file

doc = Nokogiri::XML(workbook_file.get_input_stream.read)
doc.xpath("//ns:sheet", namespace).any?(&method(:hidden?))
end
end
end
12 changes: 12 additions & 0 deletions gems/excel_analyzer/lib/excel_analyzer/railtie.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
require "rails"
require "active_storage"

module ExcelAnalyzer
##
# This Railtie integrates the gem with Rails by extending ActiveStorage's
# Analyzers with the custom ExcelAnalyzer::Analyzer.
#
class Railtie < Rails::Railtie
config.active_storage.analyzers.prepend ExcelAnalyzer::Analyzer
end
end
91 changes: 91 additions & 0 deletions gems/excel_analyzer/spec/excel_analyzer/analyzer_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# frozen_string_literal: true

require "spec_helper"

RSpec.describe ExcelAnalyzer::Analyzer do
describe ".accept?" do
subject { ExcelAnalyzer::Analyzer.accept?(blob) }

context "when the blob is an Excel file" do
let(:blob) do
fake_blob(content_type: ExcelAnalyzer::Analyzer::XLSX_CONTENT_TYPE)
end

it { is_expected.to eq true }
end

context "when the blob is not an Excel file" do
let(:blob) { fake_blob(content_type: "text/plain") }
it { is_expected.to eq false }
end
end

describe "#metadata" do
let(:metadata) { ExcelAnalyzer::Analyzer.new(blob).metadata }

context "when the blob is an Excel file with hidden data" do
let(:blob) do
fake_blob(io: File.open(File.join(__dir__, "../fixtures/suspect.xlsx")),
content_type: ExcelAnalyzer::Analyzer::XLSX_CONTENT_TYPE)
end

it "detects pivot cache" do
expect(metadata[:excel][:pivot_cache]).to eq true
end

it "detects external links" do
expect(metadata[:excel][:external_links]).to eq true
end

it "detects hidden rows" do
expect(metadata[:excel][:hidden_rows]).to eq true
end

it "detects hidden columns" do
expect(metadata[:excel][:hidden_columns]).to eq true
end

it "detects hidden sheets" do
expect(metadata[:excel][:hidden_sheets]).to eq true
end
end

context "when the blob is an Excel file without hidden data" do
let(:blob) do
fake_blob(io: File.open(File.join(__dir__, "../fixtures/data.xlsx")),
content_type: ExcelAnalyzer::Analyzer::XLSX_CONTENT_TYPE)
end

it "does not detect hidden data" do
expect(metadata[:excel]).to eq(
pivot_cache: false,
external_links: false,
hidden_rows: false,
hidden_columns: false,
hidden_sheets: false
)
end
end

context "when the blob is not an Excel file" do
let(:blob) do
fake_blob(io: File.open(File.join(__dir__, "../fixtures/plain.txt")),
content_type: "text/plain")
end

it "returns an error metadata" do
expect(metadata[:excel]).to eq(
error: "Zip end of central directory signature not found"
)
end
end
end

private

def fake_blob(io: nil, content_type:)
dbl = double(content_type: content_type)
allow(dbl).to receive(:open).and_yield(io)
dbl
end
end
Binary file added gems/excel_analyzer/spec/fixtures/data.xlsx
Binary file not shown.
1 change: 1 addition & 0 deletions gems/excel_analyzer/spec/fixtures/plain.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is a text file
Binary file added gems/excel_analyzer/spec/fixtures/suspect.xlsx
Binary file not shown.
16 changes: 16 additions & 0 deletions gems/excel_analyzer/spec/spec_helper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# frozen_string_literal: true

require "bundler/setup"
require "excel_analyzer/analyzer"

RSpec.configure do |config|
# Enable flags like --only-failures and --next-failure
config.example_status_persistence_file_path = ".rspec_status"

# Disable RSpec exposing methods globally on `Module` and `main`
config.disable_monkey_patching!

config.expect_with :rspec do |c|
c.syntax = :expect
end
end

0 comments on commit 17a3fcd

Please sign in to comment.