# Tallies how many LINE elements each SPEAKER delivers in an XML document
# made of SPEECH elements (each SPEECH holding SPEAKER and LINE descendants).
#
# Typical use:
#
#   parser = XmlParser.new('macbeth.xml')
#   parser.parse_speeches
#   parser.print_result
class XmlParser
  # Sources that start with one of these schemes are fetched through
  # open-uri; everything else is treated as a local file path.
  REMOTE_SOURCE = %r{\A(?:https?|ftp)://}i

  # doc          - the parsed document returned by Nokogiri::XML.
  # speech_count - Hash mapping speaker name => number of lines counted so far.
  attr_reader :doc, :speech_count

  # Parses the XML found at +url_or_path+ and starts with an empty tally.
  #
  # @param url_or_path [String, #to_s] an http/https/ftp URL or a local path
  def initialize(url_or_path)
    @doc = load_document(url_or_path)
    @speech_count = {}
  end

  # @param element [#xpath] a node to search below
  # @return [Integer] number of LINE descendants of +element+
  def count_lines(element)
    element.xpath('.//LINE').count
  end

  # @param element [#xpath] a node to search below
  # @return [Array<String>] text of every SPEAKER descendant of +element+
  def get_speakers(element)
    element.xpath('.//SPEAKER').map(&:text)
  end

  # Walks every SPEECH in the document and credits each of its speakers with
  # all of that speech's lines (a speech shared by two speakers counts in
  # full for both).
  #
  # The tally is emptied first, so calling this more than once yields the
  # same totals instead of doubling them.
  def parse_speeches
    @speech_count.clear
    @doc.xpath('//SPEECH').each do |speech|
      line_total = count_lines(speech)
      get_speakers(speech).each do |speaker|
        @speech_count[speaker] = @speech_count.fetch(speaker, 0) + line_total
      end
    end
  end

  # @return [Array<Array(String, Integer)>] [speaker, count] pairs, highest
  #   count first
  def sort_speech_count
    @speech_count.sort_by { |_speaker, count| count }.reverse
  end

  # Writes one "<count> <speaker>" row per speaker to standard output, in the
  # order given by #sort_speech_count.
  def print_result
    sort_speech_count.each { |speaker, count| puts "#{count} #{speaker}" }
  end

  private

  # Reads and parses the XML source, closing the underlying handle as soon as
  # parsing is done.
  #
  # Kernel#open is deliberately avoided: it no longer opens URLs on Ruby 3.0+
  # and it would run a shell command for input such as "| rm -rf ~".
  def load_document(url_or_path)
    # Deliberately loaded lazily so the counting methods above stay usable
    # without the XML and network libraries being loaded.
    require 'nokogiri'
    require 'open-uri'

    source = url_or_path.to_s
    if source =~ REMOTE_SOURCE
      URI.parse(source).open { |io| Nokogiri::XML(io) }
    else
      File.open(source) { |io| Nokogiri::XML(io) }
    end
  end
end
# Command-line entry point: prints how many lines each speaker has in a play,
# one "<count> <speaker>" row per speaker, highest count first.
#
# Usage:
#   ruby shakespeare_analyzer.rb [URL_OR_PATH]
#
# When no argument is given, the Macbeth markup below is analysed.
require_relative 'lib/xml_parser'

play_source = ARGV.first || "http://www.ibiblio.org/xml/examples/shakespeare/macbeth.xml"

xml_doc = XmlParser.new(play_source)
xml_doc.parse_speeches
xml_doc.print_result

+Text placed in the public domain by Moby Lexical Tools, 1992. +

+

SGML markup by Jon Bosak, 1992-1994.

+

XML version by Jon Bosak, 1996-1998.

+

+This work may be freely copied and distributed worldwide. +

+
+ +Dramatis Personae +DUNCAN, king of Scotland. + +MALCOLM +DONALBAIN +his sons. + + +MACBETH +BANQUO +generals of the king's army. + + +MACDUFF +LENNOX +ROSS +MENTEITH +ANGUS +CAITHNESS +noblemen of Scotland. + +FLEANCE, son to Banquo. + +SIWARD, Earl of Northumberland, general of the English forces. + +YOUNG SIWARD, his son. +SEYTON, an officer attending on Macbeth. +Boy, son to Macduff. +An English Doctor. +A Scotch Doctor. +A Soldier. +A Porter. +An Old Man. +LADY MACBETH +LADY MACDUFF +Gentlewoman attending on Lady Macbeth. +HECATE +Three Witches. +Apparitions. + +Lords, Gentlemen, Officers, Soldiers, Murderers, Attendants, and Messengers. + + +SCENE Scotland: England. +MACBETH + +ACT I + +SCENE I. A desert place. +Thunder and lightning. Enter three Witches + +First Witch +When shall we three meet again +In thunder, lightning, or in rain? + + +Second Witch +When the hurlyburly's done, +When the battle's lost and won. + + +Third Witch +That will be ere the set of sun. + + +First Witch +Where the place? + + +Second Witch +Upon the heath. + + +Third Witch +There to meet with Macbeth. + + +First Witch +I come, Graymalkin! + + +Second Witch +Paddock calls. + + +Third Witch +Anon. + + +ALL +Fair is foul, and foul is fair: +Hover through the fog and filthy air. + +Exeunt + + +
require_relative '../lib/xml_parser'
require 'rspec'
require 'webmock/rspec'

# Specs for XmlParser. Everything runs against a local fixture file, so no
# network access is needed.
describe XmlParser do

  # Path to the sample XML fixture that sits next to this spec.
  let(:xml_doc) { File.dirname(__FILE__) + '/fixtures/sample.xml' }

  describe '.new' do
    it "should call Nokogiri::XML open to parse remote XML file" do
      parsed = XmlParser.new(xml_doc)
      parsed.doc.should_not be_nil
      # be_a works on both RSpec 2 and 3; the older be_true matcher is gone in RSpec 3.
      parsed.doc.should be_a(Nokogiri::XML::Document)
    end

    it "should initialize speech_count instance variable with empty hash" do
      parsed = XmlParser.new(xml_doc)
      parsed.speech_count.should == {}
    end
  end

  describe '#count_lines' do
    it "should count # of LINE children given the XML element" do
      # Doubles stand in for a node and its node set, so only the xpath
      # query and the count call are verified.
      element = double()
      node_set = double()
      element.should_receive(:xpath).with(".//LINE").and_return(node_set)
      node_set.should_receive(:count).and_return(5)

      parsed = XmlParser.new(xml_doc)
      count = parsed.count_lines(element)
      count.should == 5
    end
  end

  describe '#get_speakers' do
    it "should return speakers in an array in the given element" do
      element = double()
      node_set = double()
      # The same double is returned twice, hence the .twice expectation on text.
      element.should_receive(:xpath).with(".//SPEAKER").and_return([node_set, node_set])
      node_set.should_receive(:text).twice.and_return("blah")

      parsed = XmlParser.new(xml_doc)
      parsed.get_speakers(element).should == ["blah", "blah"]
    end
  end

  describe '#parse_speeches' do
    it "should get nodes with SPEECH name and iterate through them" do
      parsed = XmlParser.new(xml_doc)
      parsed.parse_speeches
      parsed.speech_count.should_not == {}
    end
  end

  describe '#sort_speech_count' do
    it "should return speech_count hash sorted by speech count" do
      parsed = XmlParser.new(xml_doc)
      parsed.parse_speeches
      result = parsed.sort_speech_count
      # Each entry is a [speaker, count] pair; the first count must exceed the last.
      result.first[1].should > result.last[1]
    end
  end

  describe "#print_result" do
    it "should print results" do
      STDOUT.should_receive(:puts).with("543 Macbeth")
      STDOUT.should_receive(:puts).with("345 Banquo")
      STDOUT.should_receive(:puts).with("220 Duncan")

      parsed = XmlParser.new(xml_doc)
      # Sorting is stubbed so only the "<count> <speaker>" formatting is exercised.
      parsed.should_receive(:sort_speech_count).and_return([['Macbeth', 543], ['Banquo', 345], ['Duncan', 220]])
      parsed.print_result
    end
  end

end