Skip to content

Commit 88fb0a1

Browse files
authored
Merge pull request #21866 from Yoast/html-parser/paragraph-length
Convert Sentence length and paragraph length to use HTML parser and enable AI button for both assessments
2 parents 870b6c8 + 9a74b44 commit 88fb0a1

File tree

30 files changed

+775
-649
lines changed

30 files changed

+775
-649
lines changed

packages/yoastseo/spec/fullTextTests/testTexts/el/greekPaper.html

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,3 @@
1-
<!DOCTYPE html>
2-
<html lang="el">
3-
<head>
4-
<meta charset="UTF-8">
5-
<title>Ελληνική γλώσσα - Βικιπαίδεια</title>
6-
</head>
7-
<body>
81
<p>Η <a href="/wiki/%CE%A6%CF%89%CE%BD%CE%BF%CE%BB%CE%BF%CE%B3%CE%AF%CE%B1" title="Φωνολογία">φωνολογία</a>, η <a href="/wiki/%CE%9C%CE%BF%CF%81%CF%86%CE%BF%CE%BB%CE%BF%CE%B3%CE%AF%CE%B1_(%CE%B3%CE%BB%CF%89%CF%83%CF%83%CE%BF%CE%BB%CE%BF%CE%B3%CE%AF%CE%B1)" title="Μορφολογία (γλωσσολογία)">μορφολογία</a>, η <a href="/wiki/%CE%A3%CF%8D%CE%BD%CF%84%CE%B1%CE%BE%CE%B7_(%CE%B3%CE%BB%CF%89%CF%83%CF%83%CE%BF%CE%BB%CE%BF%CE%B3%CE%AF%CE%B1)" title="Σύνταξη (γλωσσολογία)">σύνταξη</a> και το <a href="/wiki/%CE%9B%CE%B5%CE%BE%CE%B9%CE%BB%CF%8C%CE%B3%CE%B9%CE%BF" title="Λεξιλόγιο">λεξιλόγιο</a> της γλώσσας δείχνουν τόσο συντηρητικά όσο και καινοτόμα στοιχεία σε ολόκληρη την ιστορική πορεία της γλώσσας από την αρχαία έως τη σύγχρονη περίοδο. Η διαίρεση σε συμβατικές περιόδους είναι σχετικά αυθαίρετη, ειδικά επειδή σε όλες τις περιόδους ύπαρξης της η αρχαία ελληνική έχει απολαύσει υψηλό κύρος και οι εγγράμματοι άνθρωποι χρησιμοποιούσαν πολλά δάνεια από τα αρχαία ελληνικά.
92
</p>
103
<h3><span id=".CE.A6.CF.89.CE.BD.CE.BF.CE.BB.CE.BF.CE.B3.CE.AF.CE.B1"></span><span class="mw-headline" id="Φωνολογία">Φωνολογία</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=%CE%95%CE%BB%CE%BB%CE%B7%CE%BD%CE%B9%CE%BA%CE%AE_%CE%B3%CE%BB%CF%8E%CF%83%CF%83%CE%B1&amp;veaction=edit&amp;section=8" class="mw-editsection-visualeditor" title="Επεξεργασία ενότητας: Φωνολογία">Επεξεργασία</a><span class="mw-editsection-divider"> | </span><a href="/w/index.php?title=%CE%95%CE%BB%CE%BB%CE%B7%CE%BD%CE%B9%CE%BA%CE%AE_%CE%B3%CE%BB%CF%8E%CF%83%CF%83%CE%B1&amp;action=edit&amp;section=8" title="Επεξεργασία ενότητας: Φωνολογία">επεξεργασία κώδικα</a><span class="mw-editsection-bracket">]</span></span></h3>
@@ -173,7 +166,5 @@ <h3><span id=".CE.95.CE.BB.CE.BB.CE.B7.CE.BD.CE.B9.CE.BA.CF.8C_.CE.B1.CE.BB.CF.8
173166
<p>Τα ελληνικά γράφονται στο ελληνικό αλφάβητο από τον 9ο αιώνα π.Χ. περίπου. Δημιουργήθηκε με την τροποποίηση του <a href="/wiki/%CE%A6%CE%BF%CE%B9%CE%BD%CE%B9%CE%BA%CE%B9%CE%BA%CF%8C_%CE%B1%CE%BB%CF%86%CE%AC%CE%B2%CE%B7%CF%84%CE%BF" title="Φοινικικό αλφάβητο">φοινικικού αλφαβήτου</a>, με την καινοτομία της υιοθέτησης ορισμένων νέων γραμμάτων για την γραφή των φωνηέντων. Η παραλλαγή του αλφαβήτου που χρησιμοποιείται σήμερα είναι ουσιαστικά η ύστερη <a href="/wiki/%CE%99%CF%89%CE%BD%CE%B9%CE%BA%CE%AE_%CE%B4%CE%B9%CE%AC%CE%BB%CE%B5%CE%BA%CF%84%CE%BF%CF%82" title="Ιωνική διάλεκτος">Ιωνική</a> παραλλαγή, η οποία εισήχθη για την γραφή της <a href="/wiki/%CE%91%CF%84%CF%84%CE%B9%CE%BA%CE%AE_%CE%B4%CE%B9%CE%AC%CE%BB%CE%B5%CE%BA%CF%84%CE%BF%CF%82" title="Αττική διάλεκτος">αττικής διαλέκτου</a> το 403 π.Χ. Στην κλασική ελληνική, όπως και στην κλασική λατινική, υπήρχαν μόνο κεφαλαία γράμματα. Τα πεζά ελληνικά γράμματα αναπτύχθηκαν πολύ αργότερα από τους μεσαιωνικούς γραμματείς για να επιτρέψουν ένα ταχύτερο, πιο βολικό τρόπο γραφής με τη χρήση <a href="/wiki/%CE%9C%CE%B5%CE%BB%CE%AC%CE%BD%CE%B7" title="Μελάνη">μελανιού</a> και πένας.
174167
</p><p>Το ελληνικό αλφάβητο αποτελείται από 24 γράμματα, το καθένα με κεφαλαία και πεζά γράμματα. Το <a href="/wiki/%CE%A3%CE%AF%CE%B3%CE%BC%CE%B1" title="Σίγμα">σίγμα</a> έχει μια πρόσθετη πεζή μορφή (ς) που χρησιμοποιείται στο τέλος μιας λέξης:
175168
</p>
176-
</body>
177-
</html>
178169

179170
<!-- "Ελληνική γλώσσα" by Wikipedia (EL) is licensed under CC-BY-SA 2.0 (https://creativecommons.org/licenses/by-sa/2.0/) -->

packages/yoastseo/spec/fullTextTests/testTexts/el/greekPaper.js

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ const expectedResults = {
6666
textLength: {
6767
isApplicable: true,
6868
score: 9,
69-
resultText: "<a href='https://yoa.st/34n' target='_blank'>Text length</a>: The text contains 2913 words. Good job!",
69+
resultText: "<a href='https://yoa.st/34n' target='_blank'>Text length</a>: The text contains 2910 words. Good job!",
7070
},
7171
externalLinks: {
7272
isApplicable: true,
@@ -117,25 +117,25 @@ const expectedResults = {
117117
},
118118
textParagraphTooLong: {
119119
isApplicable: true,
120-
score: 9,
121-
resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: None of the paragraphs are too long. Great job!",
120+
score: 3,
121+
resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: 3 of the paragraphs contain more than the recommended maximum number of words (150). <a href='https://yoa.st/35e' target='_blank'>Shorten your paragraphs</a>!",
122122
},
123123
textSentenceLength: {
124124
isApplicable: true,
125-
score: 6,
126-
resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 27.8% of the sentences contain more than 20 words, " +
125+
score: 3,
126+
resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 30.9% of the sentences contain more than 20 words, " +
127127
"which is more than the recommended maximum of 25%. <a href='https://yoa.st/34w' target='_blank'>Try to shorten the sentences</a>.",
128128
},
129129
textTransitionWords: {
130130
isApplicable: true,
131-
score: 3,
132-
resultText: "<a href='https://yoa.st/34z' target='_blank'>Transition words</a>: Only 19.6% of the sentences contain" +
131+
score: 6,
132+
resultText: "<a href='https://yoa.st/34z' target='_blank'>Transition words</a>: Only 20.2% of the sentences contain" +
133133
" transition words, which is not enough. <a href='https://yoa.st/35a' target='_blank'>Use more of them</a>.",
134134
},
135135
passiveVoice: {
136136
isApplicable: true,
137137
score: 3,
138-
resultText: "<a href='https://yoa.st/34t' target='_blank'>Passive voice</a>: 25.8% of the sentences contain passive voice, " +
138+
resultText: "<a href='https://yoa.st/34t' target='_blank'>Passive voice</a>: 26.6% of the sentences contain passive voice, " +
139139
"which is more than the recommended maximum of 10%. <a href='https://yoa.st/34u' target='_blank'>" +
140140
"Try to use their active counterparts</a>.",
141141
},

packages/yoastseo/spec/fullTextTests/testTexts/en/englishPaperForPerformanceTest.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,8 @@ const expectedResults = {
115115
},
116116
textParagraphTooLong: {
117117
isApplicable: true,
118-
score: 9,
119-
resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: None of the paragraphs are too long. Great job!",
118+
score: 6,
119+
resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: 1 of the paragraphs contains more than the recommended maximum number of words (150). <a href='https://yoa.st/35e' target='_blank'>Shorten your paragraphs</a>!",
120120
},
121121
textSentenceLength: {
122122
isApplicable: true,

packages/yoastseo/spec/fullTextTests/testTexts/es/spanishPaperForPerformanceTest.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,8 @@ const expectedResults = {
115115
},
116116
textParagraphTooLong: {
117117
isApplicable: true,
118-
score: 9,
119-
resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: None of the paragraphs are too long. Great job!",
118+
score: 6,
119+
resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: 1 of the paragraphs contains more than the recommended maximum number of words (150). <a href='https://yoa.st/35e' target='_blank'>Shorten your paragraphs</a>!",
120120
},
121121
textSentenceLength: {
122122
isApplicable: true,

packages/yoastseo/spec/fullTextTests/testTexts/fa/farsiPaper.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,8 @@ const expectedResults = {
118118
},
119119
textParagraphTooLong: {
120120
isApplicable: true,
121-
score: 9,
122-
resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: None of the paragraphs are too long. Great job!",
121+
score: 3,
122+
resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: 3 of the paragraphs contain more than the recommended maximum number of words (150). <a href='https://yoa.st/35e' target='_blank'>Shorten your paragraphs</a>!",
123123
},
124124
textSentenceLength: {
125125
isApplicable: true,

packages/yoastseo/spec/fullTextTests/testTexts/fr/frenchPaper.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,13 +109,13 @@ const expectedResults = {
109109
},
110110
textParagraphTooLong: {
111111
isApplicable: true,
112-
score: 9,
113-
resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: None of the paragraphs are too long. Great job!",
112+
score: 3,
113+
resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: 2 of the paragraphs contain more than the recommended maximum number of words (150). <a href='https://yoa.st/35e' target='_blank'>Shorten your paragraphs</a>!",
114114
},
115115
textSentenceLength: {
116116
isApplicable: true,
117117
score: 3,
118-
resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 40.8% of the sentences contain more" +
118+
resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 39.6% of the sentences contain more" +
119119
" than 20 words, which is more than the recommended maximum of 25%. <a href='https://yoa.st/34w' target='_blank'>" +
120120
"Try to shorten the sentences</a>.",
121121
},

packages/yoastseo/spec/fullTextTests/testTexts/he/hebrewPaper.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ const expectedResults = {
120120
textSentenceLength: {
121121
isApplicable: true,
122122
score: 3,
123-
resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 66.7% of the sentences contain more than 15 words," +
123+
resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 78.9% of the sentences contain more than 15 words," +
124124
" which is more than the recommended maximum of 25%. <a href='https://yoa.st/34w' target='_blank'>Try to shorten the sentences</a>.",
125125
},
126126
textTransitionWords: {

packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ const expectedResults = {
124124
textSentenceLength: {
125125
isApplicable: true,
126126
score: 3,
127-
resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 50.8% of the sentences contain more than 40 characters, " +
127+
resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 51.6% of the sentences contain more than 40 characters, " +
128128
"which is more than the recommended maximum of 25%. <a href='https://yoa.st/34w' target='_blank'>Try to shorten the sentences</a>.",
129129
},
130130
textTransitionWords: {

packages/yoastseo/spec/fullTextTests/testTexts/pl/polishPaper.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ const expectedResults = {
109109
textSentenceLength: {
110110
isApplicable: true,
111111
score: 3,
112-
resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 45.8% of the sentences contain more than 20 words, which is more than the recommended maximum of 15%. <a href='https://yoa.st/34w' target='_blank'>Try to shorten the sentences</a>.",
112+
resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 46.6% of the sentences contain more than 20 words, which is more than the recommended maximum of 15%. <a href='https://yoa.st/34w' target='_blank'>Try to shorten the sentences</a>.",
113113
},
114114
textTransitionWords: {
115115
isApplicable: true,

packages/yoastseo/spec/fullTextTests/testTexts/pl/polishPaperForPerformanceTest.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,8 @@ const expectedResults = {
109109
},
110110
textParagraphTooLong: {
111111
isApplicable: true,
112-
score: 9,
113-
resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: None of the paragraphs are too long. Great job!",
112+
score: 6,
113+
resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: 1 of the paragraphs contains more than the recommended maximum number of words (150). <a href='https://yoa.st/35e' target='_blank'>Shorten your paragraphs</a>!",
114114
},
115115
textSentenceLength: {
116116
isApplicable: true,

packages/yoastseo/spec/languageProcessing/helpers/html/matchParagraphsSpec.js

Lines changed: 0 additions & 42 deletions
This file was deleted.
Lines changed: 60 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,81 @@
11
import sentencesLength from "../../../../src/languageProcessing/helpers/sentence/sentencesLength";
2+
import getSentencesFromTree from "../../../../src/languageProcessing/helpers/sentence/getSentencesFromTree";
23
import JapaneseResearcher from "../../../../src/languageProcessing/languages/ja/Researcher";
34
import EnglishResearcher from "../../../../src/languageProcessing/languages/en/Researcher";
45
import Paper from "../../../../src/values/Paper";
6+
import buildTree from "../../../specHelpers/parse/buildTree";
57

68
describe( "A test to count sentence lengths.", function() {
79
it( "should not return a length for an empty sentence", function() {
8-
const sentences = [ "", "A sentence" ];
9-
const mockResearcher = new EnglishResearcher( new Paper( "" ) );
10+
const mockPaper = new Paper( "<p></p><p>A sentence</p>" );
11+
const mockResearcher = new EnglishResearcher( mockPaper );
12+
buildTree( mockPaper, mockResearcher );
1013

11-
const lengths = sentencesLength( sentences, mockResearcher );
14+
const sentenceLengths = sentencesLength( getSentencesFromTree( mockPaper ), mockResearcher );
1215

13-
expect( lengths ).toEqual( [
14-
{ sentence: "A sentence", sentenceLength: 2 },
15-
] );
16+
expect( sentenceLengths.length ).toEqual( 1 );
17+
expect( sentenceLengths[ 0 ].sentenceLength ).toEqual( 2 );
18+
expect( sentenceLengths[ 0 ].sentence.text ).toEqual( "A sentence" );
1619
} );
1720

1821
it( "should return the sentences and their length (the HTML tags should not be counted if present)", function() {
19-
const sentences = [ "A <strong>good</strong> text", "this is a <span style='color: blue;'> textstring </span>" ];
20-
const mockResearcher = new EnglishResearcher( new Paper( "" ) );
22+
const mockPaper = new Paper( "<p>A <strong>good</strong> text</p>" +
23+
"<p>this is a <span style='color: blue;'>string</span></p>" );
24+
const mockResearcher = new EnglishResearcher( mockPaper );
25+
buildTree( mockPaper, mockResearcher );
2126

22-
const lengths = sentencesLength( sentences, mockResearcher );
27+
const sentenceLengths = sentencesLength( getSentencesFromTree( mockPaper ), mockResearcher );
2328

24-
expect( lengths ).toEqual( [
25-
{ sentence: "A <strong>good</strong> text", sentenceLength: 3 },
26-
{ sentence: "this is a <span style='color: blue;'> textstring </span>", sentenceLength: 4 },
27-
] );
29+
expect( sentenceLengths.length ).toEqual( 2 );
30+
expect( sentenceLengths[ 0 ].sentenceLength ).toEqual( 3 );
31+
expect( sentenceLengths[ 0 ].sentence.text ).toEqual( "A good text" );
32+
expect( sentenceLengths[ 1 ].sentenceLength ).toEqual( 4 );
33+
expect( sentenceLengths[ 1 ].sentence.text ).toEqual( "this is a string" );
34+
} );
35+
36+
it( "should return the correct length for sentences containing hyphens", function() {
37+
const mockPaper = new Paper(
38+
"<p>My know-it-all mother-in-law made a state-of-the-art U-turn.</p>" +
39+
"<p>Her ex-husband found that low-key amazing.</p>" );
40+
const mockResearcher = new EnglishResearcher( mockPaper );
41+
buildTree( mockPaper, mockResearcher );
42+
43+
const sentenceLengths = sentencesLength( getSentencesFromTree( mockPaper ), mockResearcher );
44+
45+
expect( sentenceLengths.length ).toEqual( 2 );
46+
expect( sentenceLengths[ 0 ].sentenceLength ).toEqual( 7 );
47+
expect( sentenceLengths[ 1 ].sentenceLength ).toEqual( 6 );
48+
} );
49+
50+
it( "should return the correct length for sentences containing leading and trailing spaces including the first and last token that is not spaces", function() {
51+
const mockPaper = new Paper(
52+
"<p> The first sentence.</p><p>The second sentence. </p>" );
53+
const mockResearcher = new EnglishResearcher( mockPaper );
54+
buildTree( mockPaper, mockResearcher );
55+
56+
const sentenceLengths = sentencesLength( getSentencesFromTree( mockPaper ), mockResearcher );
57+
58+
expect( sentenceLengths.length ).toEqual( 2 );
59+
expect( sentenceLengths[ 0 ].sentenceLength ).toEqual( 3 );
60+
expect( sentenceLengths[ 0 ].firstToken ).toEqual( { sourceCodeRange: { endOffset: 7, startOffset: 4 }, text: "The" } );
61+
expect( sentenceLengths[ 0 ].lastToken ).toEqual( { sourceCodeRange: { endOffset: 23, startOffset: 22 }, text: "." } );
62+
expect( sentenceLengths[ 1 ].sentenceLength ).toEqual( 3 );
63+
expect( sentenceLengths[ 1 ].firstToken ).toEqual( { sourceCodeRange: { endOffset: 33, startOffset: 30 }, text: "The" } );
64+
expect( sentenceLengths[ 1 ].lastToken ).toEqual( { sourceCodeRange: { endOffset: 50, startOffset: 49 }, text: "." } );
2865
} );
2966

3067
it( "should return the sentences and their length for Japanese (so counting characters)", function() {
31-
const sentences = [ "自然おのずから存在しているもの", "歩くさわやかな森 <span style='color: red;'> 自然 </span>" ];
32-
const mockJapaneseResearcher = new JapaneseResearcher( new Paper( "" ) );
68+
const mockPaper = new Paper( "<p>自然おのずから存在しているもの</p>" +
69+
"<p>歩くさわやかな森 <span style='color: red;'> 自然 </span></p>" );
70+
const mockJapaneseResearcher = new JapaneseResearcher( mockPaper );
71+
buildTree( mockPaper, mockJapaneseResearcher );
3372

34-
const lengths = sentencesLength( sentences, mockJapaneseResearcher );
73+
const sentenceLengths = sentencesLength( getSentencesFromTree( mockPaper ), mockJapaneseResearcher );
3574

36-
expect( lengths ).toEqual( [
37-
{ sentence: "自然おのずから存在しているもの", sentenceLength: 15 },
38-
{ sentence: "歩くさわやかな森 <span style='color: red;'> 自然 </span>", sentenceLength: 10 },
39-
] );
75+
expect( sentenceLengths.length ).toEqual( 2 );
76+
expect( sentenceLengths[ 0 ].sentenceLength ).toEqual( 15 );
77+
expect( sentenceLengths[ 0 ].sentence.text ).toEqual( "自然おのずから存在しているもの" );
78+
expect( sentenceLengths[ 1 ].sentenceLength ).toEqual( 10 );
79+
expect( sentenceLengths[ 1 ].sentence.text ).toEqual( "歩くさわやかな森 自然 " );
4080
} );
4181
} );

0 commit comments

Comments
 (0)