From 481f0b43bec2cc0374edf43d0a1736af1ebfc73a Mon Sep 17 00:00:00 2001 From: Albina Starykova Date: Tue, 18 Jun 2024 17:03:17 +0300 Subject: [PATCH] Add language-specific reading time calculation --- client/src/includes/contentMetrics.test.ts | 66 +++++++++----------- client/src/includes/contentMetrics.ts | 72 ++++++++++++---------- tsconfig.json | 2 +- 3 files changed, 70 insertions(+), 70 deletions(-) diff --git a/client/src/includes/contentMetrics.test.ts b/client/src/includes/contentMetrics.test.ts index 9b54215c9f61..bc9a2d871158 100644 --- a/client/src/includes/contentMetrics.test.ts +++ b/client/src/includes/contentMetrics.test.ts @@ -1,41 +1,35 @@ -import { getContentMetrics } from './contentMetrics'; +import { getWordCount, getReadingTime } from './contentMetrics'; -describe('getContentMetrics', () => { - it('should return correct wordCount and readingTime using Intl.Segmenter', () => { - const result = getContentMetrics('en-US', 'This is a test sentence.'); - expect(result.wordCount).toBe(5); - expect(result.readingTime).toBe(0); - }); - - it('should handle empty text', () => { - const result = getContentMetrics('en-US', ''); - expect(result.wordCount).toBe(0); - expect(result.readingTime).toBe(0); - }); - - it('should handle text with punctuation correctly', () => { - const text = `This is a longer text to test the word count and reading time calculation! - Bread is a staple food prepared from a dough of flour and water; usually by baking. - Throughout recorded history it has been popular around the world and is one of the - oldest artificial foods: having been of importance since the dawn of agriculture. - Proportions of types of flour and other ingredients vary widely? as do modes of preparation. - As a result, types, shapes, sizes, and textures of breads differ around the world. - Bread may be leavened by processes such as reliance on naturally occurring sourdough - microbes, chemicals, industrially produced yeast, or high-pressure aeration... - Some breads are baked before they have a chance to rise, often for traditional - or religious reasons. Inclusions like fruits; nuts, and fats are sometimes added. - Commercial bread typically includes additives to enhance flavor, texture, color, - longevity, and production efficiency! - `; - const result = getContentMetrics('en-US', text); - expect(result.wordCount).toBe(148); - expect(result.readingTime).toBe(1); +describe.each` + text | lang | wordCount + ${'¿Donde esta la biblioteca?'} | ${'es'} | ${4} + ${"It's lots. Of; Punctuation"} | ${'en'} | ${4} + ${'האהבה היא אוקיינוס שאין לו התחלה ואין לו סוף.'} | ${'he'} | ${9} + ${'元気です、ありがとう。あなたは?'} | ${'zh'} | ${5} + ${'Dit is een testzin in het Nederlands.'} | ${'nl'} | ${7} + ${'Je suis content de te voir!'} | ${'fr'} | ${6} + ${'Ich liebe dich!'} | ${'de'} | ${3} + ${'Mi piace molto questo libro.'} | ${'it'} | ${5} + ${'저는 오늘 날씨가 좋아요.'} | ${'ko'} | ${4} +`('getWordCount', ({ text, lang, wordCount }) => { + test(`correctly counts words in '${text}' for language '${lang}'`, () => { + expect(getWordCount(lang, text)).toBe(wordCount); }); +}); - it('should return integers for wordCount and readingTime', () => { - const text = 'Yet another text'; - const result = getContentMetrics('en-US', text); - expect(Number.isInteger(result.wordCount)).toBe(true); - expect(Number.isInteger(result.readingTime)).toBe(true); +describe.each` + lang | wordCount | readingTime + ${'es'} | ${1000} | ${4} + ${'fr'} | ${1000} | ${5} + ${'ar'} | ${360} | ${2} + ${'it'} | ${360} | ${1} + ${'en'} | ${238} | ${1} + ${'he'} | ${224} | ${1} + ${'zh'} | ${520} | ${2} + ${'nl'} | ${320} | ${1} + ${'ko'} | ${50} | ${0} +`('getReadingTime', ({ lang, wordCount, readingTime }) => { + test(`calculates reading time for '${wordCount}' words in language '${lang}'`, () => { + expect(getReadingTime(lang, wordCount)).toBe(readingTime); }); }); diff --git a/client/src/includes/contentMetrics.ts b/client/src/includes/contentMetrics.ts index adf0f8b3630c..1d4dad66c930 100644 --- a/client/src/includes/contentMetrics.ts +++ b/client/src/includes/contentMetrics.ts @@ -3,40 +3,44 @@ interface ContentMetrics { readingTime: number; } -interface SegmentData { - segment: string; - isWordLike?: boolean | undefined; -} +export const getWordCount = (lang: string, text: string): number => { + const segmenter = new Intl.Segmenter(lang, { granularity: 'word' }); + const segments: Intl.SegmentData[] = Array.from(segmenter.segment(text)); + const wordCount = segments.reduce( + (count, segment) => (segment.isWordLike ? count + 1 : count), + 0, + ); -export const getContentMetrics = ( - lang: string, - text: string, -): ContentMetrics => { - let wordCount = 0; + return wordCount; +}; - if (typeof Intl.Segmenter === 'function') { - const segmenter = new Intl.Segmenter(lang, { granularity: 'word' }); - const segments: SegmentData[] = Array.from(segmenter.segment(text)); - wordCount = segments.reduce( - (count, segment) => (segment.isWordLike ? count + 1 : count), - 0, - ); - } else { - // Fallback to regex if Intl.Segmenter is not supported - wordCount = - text - .trim() - .replace(/['";:,.?¿\-!¡]+/g, '') - .match(/\S+/g)?.length || 0; - } +/* +Language-specific reading speeds according to a meta-analysis of 190 studies on reading rates. +Study preprint: https://osf.io/preprints/psyarxiv/xynwg/ +DOI: https://doi.org/10.1016/j.jml.2019.104047 + */ +const readingSpeeds = { + ar: 181, // Arabic + zh: 260, // Chinese + nl: 228, // Dutch + en: 238, // English + fi: 195, // Finnish + fr: 214, // French + de: 260, // German + he: 224, // Hebrew + it: 285, // Italian + ko: 226, // Korean + es: 278, // Spanish + sv: 218, // Swedish +}; - // Silent-reading adults average 238 words per minute - const readingTime = Math.round(wordCount / 238); +export const getReadingTime = (lang: string, wordCount: number): number => { + const locale = lang.split('-')[0]; + // Fallback to English reading speed if the locale is not found + const readingSpeed = readingSpeeds[locale] || readingSpeeds.en; + const readingTime = Math.round(wordCount / readingSpeed); - return { - wordCount, - readingTime, - }; + return readingTime; }; const renderContentMetrics = ({ wordCount, readingTime }: ContentMetrics) => { @@ -79,8 +83,10 @@ export const runContentCheck = () => { if (!iframe || !iframeDocument || !text) { return; } - const lang = iframeDocument.documentElement.lang || 'en-US'; - const contentMetrics = getContentMetrics(lang, text); + const lang = iframeDocument.documentElement.lang || 'en'; + + const wordCount = getWordCount(lang, text); + const readingTime = getReadingTime(lang, wordCount); - renderContentMetrics(contentMetrics); + renderContentMetrics({ wordCount, readingTime }); }; diff --git a/tsconfig.json b/tsconfig.json index e27a60c91d2a..5d41e591870a 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -5,7 +5,7 @@ "esModuleInterop": true, "forceConsistentCasingInFileNames": true, "jsx": "react", - "lib": ["ES2022", "DOM", "DOM.iterable"], + "lib": ["ES2022", "ES2022.Intl", "DOM", "DOM.iterable"], "moduleResolution": "node", "noImplicitAny": false, // TODO: Enable once all existing code is typed "noUnusedLocals": true,