Hi, this is a test.
Hi, this is a test.
Here is the list of food you can give your cat.
" + + "Lorem ipsum dolor sit amet, est minim reprimique et, impetus interpretaris eos ea.
" + + "Aperiri scripserit per cu, at mea graeci numquam.
" + + "Ne vix clita soluta persecuti, vel at fugit labores, mentitum intellegebat ius ex. " + + "Cu semper comprehensam duo, pro fugit animal reprehendunt et.
" + + "Has an natum errem, vix oratio mediocrem an, pro ponderum senserit dignissim ut.
"; + + expect( htmlParser( text ) ).toBe( "Here is the list of food you can give your cat.
" + + "Lorem ipsum dolor sit amet, est minim " + + "reprimique et, impetus interpretaris eos ea.
Aperiri scripserit per cu, at mea graeci numquam.
Ne vix clita soluta persecuti, vel at fugit labores, mentitum intellegebat ius ex. Cu semper comprehensam duo, " + + "pro fugit animal reprehendunt et.
Has an natum errem, vix oratio mediocrem an, pro ponderum senserit dignissim ut.
" ); + } ); +} ); + +describe( "Strips the estimated reading time from the analysis text.", function() { + it( "should return a text without the estimated reading time", function() { + const text = "" + + "" + + "" + + "Estimated reading time: " + + "2 minutes
" + + "For the first time in 70 years, India’s forests will be home to cheetahs.
" + + "Eight of them are set to arrive in August from Namibia, home to one of the world’s largest populations of the wild cat.
" + + "Their return comes decades after India’s indigenous population was declared officially extinct in 1952.
" + + "The world’s fastest land animal, the cheetah can reach speeds of 70 miles (113km) an hour.
"; + expect( htmlParser( text ) ).toEqual( + "For the first time in 70 years, India’s forests will be home to cheetahs.
" + + "Eight of them are set to arrive in August from Namibia, home to one of the world’s largest populations of the wild cat.
" + + "Their return comes decades after India’s indigenous population was declared officially extinct in 1952.
" + + "The world’s fastest land animal, the cheetah can reach speeds of 70 miles (113km) an hour.
" ); + } ); + + it( "should return a text without the estimated reading time, even if additional classes are added to the p element", function() { + const text = "" + + "" + + "" + + "Estimated reading time: " + + "2 minutes
" + + "This test has some more class(es).
"; + expect( htmlParser( text ) ).toEqual( + "This test has some more class(es).
" ); + } ); } ); diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/excludeEstimatedReadingTimeSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/excludeEstimatedReadingTimeSpec.js deleted file mode 100644 index 84fd4486533..00000000000 --- a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/excludeEstimatedReadingTimeSpec.js +++ /dev/null @@ -1,31 +0,0 @@ -import excludeEstimatedReadingTime from "../../../../src/languageProcessing/helpers/sanitize/excludeEstimatedReadingTime.js"; - -describe( "Strips the estimated reading time from the analysis text.", function() { - it( "returns a text without the estimated reading time", function() { - const text = "" + - "" + - "" + - "Estimated reading time: " + - "2 minutes
" + - "For the first time in 70 years, India’s forests will be home to cheetahs.
" + - "Eight of them are set to arrive in August from Namibia, home to one of the world’s largest populations of the wild cat.
" + - "Their return comes decades after India’s indigenous population was declared officially extinct in 1952.
" + - "The world’s fastest land animal, the cheetah can reach speeds of 70 miles (113km) an hour.
"; - expect( excludeEstimatedReadingTime( text ) ).toEqual( - "For the first time in 70 years, India’s forests will be home to cheetahs.
" + - "Eight of them are set to arrive in August from Namibia, home to one of the world’s largest populations of the wild cat.
" + - "Their return comes decades after India’s indigenous population was declared officially extinct in 1952.
" + - "The world’s fastest land animal, the cheetah can reach speeds of 70 miles (113km) an hour.
" ); - } ); - - it( "returns a text without the estimated reading time, even if additional classes are added to the p element", function() { - const text = "" + - "" + - "" + - "Estimated reading time: " + - "2 minutes
" + - "This test has some more class(es).
"; - expect( excludeEstimatedReadingTime( text ) ).toEqual( - "This test has some more class(es).
" ); - } ); -} ); diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/excludeTableOfContentsTagSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/excludeTableOfContentsTagSpec.js deleted file mode 100644 index ce27c45075c..00000000000 --- a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/excludeTableOfContentsTagSpec.js +++ /dev/null @@ -1,28 +0,0 @@ -import excludeTableOfContentsTag from "../../../../src/languageProcessing/helpers/sanitize/excludeTableOfContentsTag.js"; - -describe( "Strips the table of contents from the text.", function() { - it( "returns a text without the table of contents", function() { - const text = "Here is the list of food you can give your cat.
" + - "Lorem ipsum dolor sit amet, est minim reprimique et, impetus interpretaris eos ea.
" + - "Aperiri scripserit per cu, at mea graeci numquam.
" + - "Ne vix clita soluta persecuti, vel at fugit labores, mentitum intellegebat ius ex. " + - "Cu semper comprehensam duo, pro fugit animal reprehendunt et.
" + - "Has an natum errem, vix oratio mediocrem an, pro ponderum senserit dignissim ut.
"; - - expect( excludeTableOfContentsTag( text ) ).toBe( "Here is the list of food you can give your cat.
" + - "Lorem ipsum dolor sit amet, est minim " + - "reprimique et, impetus interpretaris eos ea.
Aperiri scripserit per cu, at mea graeci numquam.
Ne vix clita soluta persecuti, vel at fugit labores, mentitum intellegebat ius ex. Cu semper comprehensam duo, " + - "pro fugit animal reprehendunt et.
Has an natum errem, vix oratio mediocrem an, pro ponderum senserit dignissim ut.
" ); - } ); -} ); diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/sanitizeStringSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/sanitizeStringSpec.js index dbc9c6014cd..01559148029 100644 --- a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/sanitizeStringSpec.js +++ b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/sanitizeStringSpec.js @@ -10,29 +10,6 @@ describe( "Test for removing unwanted characters.", function() { expect( sanitizeString( "50/50" ) ).toBe( "50/50" ); expect( sanitizeString( "50/50
" ) ).toBe( "50/50" ); } ); - it( "excludes Table of Content from the text", () => { - const text = "Here is the list of food you can give your cat.
" + - "Lorem ipsum dolor sit amet, est minim reprimique et, impetus interpretaris eos ea.
" + - "Aperiri scripserit per cu, at mea graeci numquam.
" + - "Ne vix clita soluta persecuti, vel at fugit labores, mentitum intellegebat ius ex. " + - "Cu semper comprehensam duo, pro fugit animal reprehendunt et.
" + - "Has an natum errem, vix oratio mediocrem an, pro ponderum senserit dignissim ut.
"; - - expect( sanitizeString( text ) ).toEqual( "Here is the list of food you can give your cat. Food that are raw Lorem ipsum dolor sit amet, " + - "est minim reprimique et, impetus interpretaris eos ea. Food from fresh meat Aperiri scripserit per cu, at mea graeci numquam." + - " Food that contains vegetables Ne vix clita soluta persecuti, vel at fugit labores, mentitum intellegebat ius ex. " + - "Cu semper comprehensam duo, pro fugit animal reprehendunt et. Food that are cooked Has an natum errem, vix oratio mediocrem an, " + - "pro ponderum senserit dignissim ut." - ); - } ); it( "unifies whitespaces and non-breaking spaces", () => { const text = "A text\u0020string."; diff --git a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js index a601da67476..8132753b4a6 100644 --- a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js +++ b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js @@ -16,27 +16,6 @@ describe( "counts characters in a string", function() { expect( countCharactersFunction( "Низът в компютърните науки е крайна поредица от символи " + "(представляващи краен брой знаци)." ) ).toBe( 78 ); } ); - it( "makes sure that the table of contents is excluded from the calculation", function() { - const text = "「どんぐりころころ」は、大正時代に作られた" + - "唱歌、広義の。
大正時代に青木存義によって作られた唱歌集『かはいい唱歌』(共益商社書店)が初出である。発表年は2説ある。" + - "これは初出の『かはいい唱歌 二冊目』の奥付が、初版本とその後の重版本とで異なることに起因する。巷に比較的現存している部数が多い重版本では、「一冊目」と同一日付の" + - "「大正十年十月」発行との記載があり、この1921年(大正10年)10月であるとする説が主流である。もう1説は、初版本に由来する。青木の故郷である松島町では昭和後期から青木の歌" + - "を歌い継ごうとする動きが活発となり、そうした活動を通じて地元の郷土史家らが青木家の関係者から本作が掲載されている「二冊目」を譲り受けた。
" + - "本作品が掲載された『かはいい唱歌』は「幼稚園又は小学校初年級程度」の子どもを対象として作成されている。青木は当時「文部省図書監修官」及び「小学校唱歌教科書編纂委員」" + - "の任にあったものの、この唱歌集は私的に民間の出版社から出したものであり、いわゆる文部省編纂の「\">文部省唱歌」にはあたらない。「一冊目」「二冊目」ともに10編、" + - " 計20編が収録されており、本作品は「二冊目」の第7番目に掲載されている。" + - "作詞は全て青木自身であり、青木の詞に曲をつけた作曲者は計12名、本作品の作曲者である梁田貞は" + - "『兎と狸』と併せ計2曲の作曲を担当している。
" + - "戦後においては一般に広義の童謡にカテゴライズされる本作品は、" + - "初出本の題名にもあるとおり青木自身は「唱歌」であるとし、「学校や家庭で」歌ってもらえれば本懐であるとしている。しかし発表当時の教育現場では、" + - "本作品を歌うことは原則上はできなかった。
"; - expect( countCharactersFunction( text ) ).toBe( 744 ); - } ); it( "makes sure that no characters are counted when a URL is embedded in video tags", function() { const text = "\n" + diff --git a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/getWordsSpec.js b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/getWordsSpec.js index 775f57b6633..4e04bc36719 100644 --- a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/getWordsSpec.js +++ b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/getWordsSpec.js @@ -21,18 +21,4 @@ describe( "test for getting Japanese segmented words", function() { expect( words ).toEqual( [ "計画", "段階", "で", "は", "東海道", "新線", "と", "呼ば", "れ", "て", "い", "た", "が", "開業", "時", "に", "は", "東海道", "新幹線", "と", "命名", "さ", "れ", "た" ] ); } ); - - it( "excludes Table of Contents from the segmenter and strips html tags", function() { - const words = getWords( "" + - "ベロでは、毛皮の色に基づいて猫の種類を見つけることができます。
" + - "猫が食べることができる食べ物の例は以下にあります。
" ); - - expect( words ).toEqual( [ "猫", "の", "種類", "ベロ", "で", "は", "毛皮", "の", "色", "に", "基づい", "て", "猫", - "の", "種類", "を", "見つける", "こと", "が", "でき", "ます", "キャットフード", "猫", "が", "食べる", "こと", "が", "できる", - "食べ物", "の", "例", "は", "以下", "に", "あり", "ます" ] - ); - } ); } ); diff --git a/packages/yoastseo/spec/parse/build/buildSpec.js b/packages/yoastseo/spec/parse/build/buildSpec.js index e85cf7fe33d..cad0ea56596 100644 --- a/packages/yoastseo/spec/parse/build/buildSpec.js +++ b/packages/yoastseo/spec/parse/build/buildSpec.js @@ -853,17 +853,6 @@ describe( "The parse function", () => { name: "#document-fragment", attributes: {}, childNodes: [ { - name: "p", - isImplicit: true, - attributes: {}, - sentences: [], - childNodes: [], - sourceCodeLocation: { - startOffset: 0, - endOffset: 45, - }, - }, - { name: "p", isImplicit: false, attributes: {}, diff --git a/packages/yoastseo/spec/parse/build/private/filterTreeSpec.js b/packages/yoastseo/spec/parse/build/private/filterTreeSpec.js index f54d00d605f..f530da24680 100644 --- a/packages/yoastseo/spec/parse/build/private/filterTreeSpec.js +++ b/packages/yoastseo/spec/parse/build/private/filterTreeSpec.js @@ -278,6 +278,14 @@ describe( "Miscellaneous tests", () => { const tree = adapt( parseFragment( html, { sourceCodeLocationInfo: true } ) ); expect( tree.findAll( child => child.name === "abbr" ) ).toHaveLength( 0 ); } ); + + it( "should filter out span elements and remove the implicit paragraph it's part of", () => { + const html = 'My cat loves me.'; + const tree = adapt( parseFragment( html, { sourceCodeLocationInfo: true } ) ); + const filteredTree = filterTree( tree, permanentFilters ); + expect( filteredTree.findAll( child => [ "span", "p" ].includes( child.name ) ) ).toHaveLength( 0 ); + } ); + it( "should filter out the Elementor Yoast Breadcrumbs widget ", () => { // When the HTML enters the paper, the Breadcrumbs widget doesn't include the div tag. let html = ").*?(
|
)$" ); */ export default function( text, memoizedTokenizer = defaultSentenceTokenizer ) { // We don't remove the other HTML tags here since removing them might lead to incorrect results when running the sentence tokenizer. - // Remove Table of Contents. - text = excludeTableOfContentsTag( text ); - // Remove Estimated reading time. - text = excludeEstimatedReadingTime( text ); // Unify only non-breaking spaces and not the other whitespaces since a whitespace could signify a sentence break or a new line. text = unifyNonBreakingSpace( text ); /* diff --git a/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js index 5c2cec887af..180d803370f 100644 --- a/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js +++ b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js @@ -4,8 +4,7 @@ import removeURLs from "../../../helpers/sanitize/removeURLs.js"; /** * Calculates the character count which serves as a measure of text length. - * The character count includes letters, punctuation, and numbers. It doesn't include URLs, HTML tags, spaces, and the - * content of the Table of Contents and Estimated Reading Time blocks. + * The character count includes letters, punctuation, and numbers. It doesn't include URLs, HTML tags, and spaces. * * @param {string} text The text to be counted. * diff --git a/packages/yoastseo/src/languageProcessing/languages/ja/helpers/getWords.js b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/getWords.js index 69e653883bf..401db21393c 100644 --- a/packages/yoastseo/src/languageProcessing/languages/ja/helpers/getWords.js +++ b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/getWords.js @@ -13,7 +13,7 @@ const segmenter = new TinySegmenter(); * @returns {Array} The array with all words. */ export default function( text ) { - // Strips HTML tags and exclude Table of Contents from the analysis. + // Strips HTML tags. text = sanitizeString( text ); if ( text === "" ) { return []; diff --git a/packages/yoastseo/src/languageProcessing/researches/getParagraphLength.js b/packages/yoastseo/src/languageProcessing/researches/getParagraphLength.js index a85cb29d00f..f45b223f85e 100644 --- a/packages/yoastseo/src/languageProcessing/researches/getParagraphLength.js +++ b/packages/yoastseo/src/languageProcessing/researches/getParagraphLength.js @@ -1,6 +1,4 @@ import { imageRegex } from "../helpers/image/imageInText"; -import excludeTableOfContentsTag from "../helpers/sanitize/excludeTableOfContentsTag"; -import excludeEstimatedReadingTime from "../helpers/sanitize/excludeEstimatedReadingTime"; import sanitizeLineBreakTag from "../helpers/sanitize/sanitizeLineBreakTag"; import countWords from "../helpers/word/countWords.js"; import matchParagraphs from "../helpers/html/matchParagraphs.js"; @@ -21,9 +19,6 @@ export default function( paper, researcher ) { text = removeHtmlBlocks( text ); text = filterShortcodesFromHTML( text, paper._attributes && paper._attributes.shortcodes ); - text = excludeTableOfContentsTag( text ); - // Exclude the Estimated Reading time text from the research - text = excludeEstimatedReadingTime( text ); // Remove images from text before retrieving the paragraphs. // This step is done here so that applying highlight in captions is possible for ParagraphTooLongAssessment. text = text.replace( imageRegex, "" ); diff --git a/packages/yoastseo/src/languageProcessing/researches/getSubheadingTextLengths.js b/packages/yoastseo/src/languageProcessing/researches/getSubheadingTextLengths.js index 618413675e1..9cc81f4acac 100644 --- a/packages/yoastseo/src/languageProcessing/researches/getSubheadingTextLengths.js +++ b/packages/yoastseo/src/languageProcessing/researches/getSubheadingTextLengths.js @@ -1,5 +1,4 @@ import getSubheadingTexts from "../helpers/html/getSubheadingTexts"; -import excludeTableOfContentsTag from "../helpers/sanitize/excludeTableOfContentsTag"; import countWords from "../helpers/word/countWords"; import { forEach } from "lodash-es"; import removeHtmlBlocks from "../helpers/html/htmlParser"; @@ -17,7 +16,6 @@ export default function( paper, researcher ) { let text = paper.getText(); text = removeHtmlBlocks( text ); text = filterShortcodesFromHTML( text, paper._attributes && paper._attributes.shortcodes ); - text = excludeTableOfContentsTag( text ); const matches = getSubheadingTexts( text ); // An optional custom helper to count length to use instead of countWords. diff --git a/packages/yoastseo/src/languageProcessing/researches/matchKeywordInSubheadings.js b/packages/yoastseo/src/languageProcessing/researches/matchKeywordInSubheadings.js index 82e4a332912..21de6552fe8 100644 --- a/packages/yoastseo/src/languageProcessing/researches/matchKeywordInSubheadings.js +++ b/packages/yoastseo/src/languageProcessing/researches/matchKeywordInSubheadings.js @@ -1,5 +1,4 @@ import { getSubheadingContentsTopLevel } from "../helpers/html/getSubheadings"; -import excludeTableOfContentsTag from "../helpers/sanitize/excludeTableOfContentsTag"; import stripSomeTags from "../helpers/sanitize/stripNonTextTags"; import { findTopicFormsInString } from "../helpers/match/findKeywordFormsInString"; import removeHtmlBlocks from "../helpers/html/htmlParser"; @@ -46,7 +45,7 @@ export default function matchKeywordInSubheadings( paper, researcher ) { let text = paper.getText(); text = removeHtmlBlocks( text ); text = filterShortcodesFromHTML( text, paper._attributes && paper._attributes.shortcodes ); - text = stripSomeTags( excludeTableOfContentsTag( text ) ); + text = stripSomeTags( text ); const topicForms = researcher.getResearch( "morphology" ); const locale = paper.getLocale(); const result = { count: 0, matches: 0, percentReflectingTopic: 0 }; diff --git a/packages/yoastseo/src/parse/build/private/alwaysFilterElements.js b/packages/yoastseo/src/parse/build/private/alwaysFilterElements.js index 7a8f99c3363..52b68fb3020 100644 --- a/packages/yoastseo/src/parse/build/private/alwaysFilterElements.js +++ b/packages/yoastseo/src/parse/build/private/alwaysFilterElements.js @@ -15,8 +15,23 @@ const permanentFilters = [ // Comments are filtered out in `filterBeforeTokenizing.js` step. elementHasClass( "yoast-table-of-contents" ), elementHasClass( "yoast-reading-time__wrapper" ), - // Filters for the Elementor widget Yoast Breadcrumbs. + // Filters for Elementor widgets elementHasID( "breadcrumbs" ), + elementHasClass( "elementor-button-wrapper" ), + elementHasClass( "elementor-divider" ), + elementHasClass( "elementor-spacer" ), + elementHasClass( "elementor-custom-embed" ), + elementHasClass( "elementor-icon-wrapper" ), + elementHasClass( "elementor-icon-box-wrapper" ), + elementHasClass( "elementor-counter" ), + elementHasClass( "elementor-progress-wrapper" ), + // This element is used for the progress bar widget title. + elementHasClass( "elementor-title" ), + elementHasClass( "elementor-alert" ), + elementHasClass( "elementor-soundcloud-wrapper" ), + elementHasClass( "elementor-shortcode" ), + elementHasClass( "elementor-menu-anchor" ), + elementHasClass( "e-rating" ), // Filters out HTML elements. /* Elements are filtered out when: they contain content outside of the author's control (incl. quotes and embedded content); their content isn't natural language (e.g. code); they contain metadata hidden from the page visitor diff --git a/packages/yoastseo/src/parse/build/private/filterTree.js b/packages/yoastseo/src/parse/build/private/filterTree.js index f187dc8a1a4..1e2660726f2 100644 --- a/packages/yoastseo/src/parse/build/private/filterTree.js +++ b/packages/yoastseo/src/parse/build/private/filterTree.js @@ -1,3 +1,5 @@ +import { Paragraph } from "../../structure"; + /** * Checks if a node should be kept or discarded. * @param {Node} node A node. @@ -29,6 +31,11 @@ export default function filterTree( node, filters ) { // Recursively filters the node's children. if ( node.childNodes ) { node.childNodes = node.childNodes.filter( childNode => filterTree( childNode, filters ) ); + + // Drops implicit paragraphs if all their child nodes have been removed. + if ( node.childNodes.length === 0 && node instanceof Paragraph && node.isImplicit ) { + return; + } } return node;