diff --git a/docs/api_docs/python/index.md b/docs/api_docs/python/index.md index c1ca155db..a182c83d3 100644 --- a/docs/api_docs/python/index.md +++ b/docs/api_docs/python/index.md @@ -15,7 +15,7 @@ * text.UnicodeCharTokenizer * text.UnicodeScriptTokenizer * text.WhitespaceTokenizer -* text.WordShape +* text.WordShape * text.WordpieceTokenizer * text.case_fold_utf8 * text.coerce_to_structurally_valid_utf8 diff --git a/docs/api_docs/python/text.md b/docs/api_docs/python/text.md index d90d73682..c8ffffbb5 100644 --- a/docs/api_docs/python/text.md +++ b/docs/api_docs/python/text.md @@ -74,7 +74,7 @@ allocates a length budget to segments in order. [`class WhitespaceTokenizer`](./text/WhitespaceTokenizer.md): Tokenizes a tensor of UTF-8 strings on whitespaces. -[`class WordShape`](./text/WordShape.md): Values for the 'pattern' arg of the +[`class WordShape`](./text/WordShape_cls.md): Values for the 'pattern' arg of the wordshape op. [`class WordpieceTokenizer`](./text/WordpieceTokenizer.md): Tokenizes a tensor diff --git a/docs/api_docs/python/text/WordShape.md b/docs/api_docs/python/text/WordShape.md deleted file mode 100644 index dc0d08fd4..000000000 --- a/docs/api_docs/python/text/WordShape.md +++ /dev/null @@ -1,249 +0,0 @@ -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- -# text.WordShape - - - - - -
- -View -source - -Values for the 'pattern' arg of the wordshape op. - - - -The supported wordshape identifiers are: - -* WordShape.BEGINS_WITH_OPEN_QUOTE: - The input begins with an open quote. - - The following strings are considered open quotes: - - ``` - " QUOTATION MARK - ' APOSTROPHE - ` GRAVE ACCENT - `` Pair of GRAVE ACCENTs - \uFF02 FULLWIDTH QUOTATION MARK - \uFF07 FULLWIDTH APOSTROPHE - \u00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK - \u2018 LEFT SINGLE QUOTATION MARK - \u201A SINGLE LOW-9 QUOTATION MARK - \u201B SINGLE HIGH-REVERSED-9 QUOTATION MARK - \u201C LEFT DOUBLE QUOTATION MARK - \u201E DOUBLE LOW-9 QUOTATION MARK - \u201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK - \u2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK - \u300C LEFT CORNER BRACKET - \u300E LEFT WHITE CORNER BRACKET - \u301D REVERSED DOUBLE PRIME QUOTATION MARK - \u2E42 DOUBLE LOW-REVERSED-9 QUOTATION MARK - \uFF62 HALFWIDTH LEFT CORNER BRACKET - \uFE41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET - \uFE43 PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET - ``` - - Note: U+B4 (acute accent) not included. - -* WordShape.BEGINS_WITH_PUNCT_OR_SYMBOL: - The input starts with a punctuation or symbol character. - -* WordShape.ENDS_WITH_CLOSE_QUOTE: - The input ends witha closing quote character. 
- - The following strings are considered close quotes: - - ``` - " QUOTATION MARK - ' APOSTROPHE - ` GRAVE ACCENT - '' Pair of APOSTROPHEs - \uFF02 FULLWIDTH QUOTATION MARK - \uFF07 FULLWIDTH APOSTROPHE - \u00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK - \u2019 RIGHT SINGLE QUOTATION MARK - \u201D RIGHT DOUBLE QUOTATION MARK - \u203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK - \u300D RIGHT CORNER BRACKET - \u300F RIGHT WHITE CORNER BRACKET - \u301E DOUBLE PRIME QUOTATION MARK - \u301F LOW DOUBLE PRIME QUOTATION MARK - \uFE42 PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET - \uFE44 PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET - \uFF63 HALFWIDTH RIGHT CORNER BRACKET - ``` - - Note: U+B4 (ACUTE ACCENT) is not included. - -* WordShape.ENDS_WITH_ELLIPSIS: - The input ends with an ellipsis (i.e. with three or more periods or a - unicode ellipsis character). - -* WordShape.ENDS_WITH_EMOTICON: - The input ends with an emoticon. - -* WordShape.ENDS_WITH_MULTIPLE_SENTENCE_TERMINAL: - The input ends with multiple sentence-terminal characters. - -* WordShape.ENDS_WITH_MULTIPLE_TERMINAL_PUNCT: - The input ends with multiple terminal-punctuation characters. - -* WordShape.ENDS_WITH_PUNCT_OR_SYMBOL: - The input ends with a punctuation or symbol character. - -* WordShape.ENDS_WITH_SENTENCE_TERMINAL: - The input ends with a sentence-terminal character. - -* WordShape.ENDS_WITH_TERMINAL_PUNCT: - The input ends with a terminal-punctuation character. - -* WordShape.HAS_CURRENCY_SYMBOL: - The input contains a currency symbol. - -* WordShape.HAS_EMOJI: - The input contains an emoji character. - - See http://www.unicode.org/Public/emoji/1.0//emoji-data.txt. Emojis are in - unicode ranges `2600-26FF`, `1F300-1F6FF`, and `1F900-1F9FF`. - -* WordShape.HAS_MATH_SYMBOL: - The input contains a mathematical symbol. - -* WordShape.HAS_MIXED_CASE: - The input contains both uppercase and lowercase letterforms. 
- -* WordShape.HAS_NON_LETTER: - The input contains a non-letter character. - -* WordShape.HAS_NO_DIGITS: - The input contains no digit characters. - -* WordShape.HAS_NO_PUNCT_OR_SYMBOL: - The input contains no unicode punctuation or symbol characters. - -* WordShape.HAS_NO_QUOTES: - The input string contains no quote characters. - -* WordShape.HAS_ONLY_DIGITS: - The input consists entirely of unicode digit characters. - -* WordShape.HAS_PUNCTUATION_DASH: - The input contains at least one unicode dash character. - - Note that this uses the Pd (Dash) unicode property. This property will not - match to soft-hyphens and katakana middle dot characters. - -* WordShape.HAS_QUOTE: - The input starts or ends with a unicode quotation mark. - -* WordShape.HAS_SOME_DIGITS: - The input contains a mix of digit characters and non-digit characters. - -* WordShape.HAS_SOME_PUNCT_OR_SYMBOL: - The input contains a mix of punctuation or symbol characters, and - non-punctuation non-symbol characters. - -* WordShape.HAS_TITLE_CASE: - The input has title case (i.e. the first character is upper or title case, - and the remaining characters are lowercase). - -* WordShape.IS_ACRONYM_WITH_PERIODS: - The input is a period-separated acronym. This matches for strings of the - form "I.B.M." but not "IBM". - -* WordShape.IS_EMOTICON: - The input is a single emoticon. - -* WordShape.IS_LOWERCASE: - The input contains only lowercase letterforms. - -* WordShape.IS_MIXED_CASE_LETTERS: - The input contains only uppercase and lowercase letterforms. - -* WordShape.IS_NUMERIC_VALUE: - The input is parseable as a numeric value. This will match a fairly broad - set of floating point and integer representations (but not Nan or Inf). - -* WordShape.IS_PUNCT_OR_SYMBOL: - The input contains only punctuation and symbol characters. - -* WordShape.IS_UPPERCASE: - The input contains only uppercase letterforms. - -* WordShape.IS_WHITESPACE: - The input consists entirely of whitespace. 
- -## Class Variables - -* `BEGINS_WITH_OPEN_QUOTE` -* `BEGINS_WITH_PUNCT_OR_SYMBOL` -* `ENDS_WITH_CLOSE_QUOTE` -* `ENDS_WITH_ELLIPSIS` -* `ENDS_WITH_EMOTICON` -* `ENDS_WITH_MULTIPLE_SENTENCE_TERMINAL` - -* `ENDS_WITH_MULTIPLE_TERMINAL_PUNCT` - -* `ENDS_WITH_PUNCT_OR_SYMBOL` -* `ENDS_WITH_SENTENCE_TERMINAL` -* `ENDS_WITH_TERMINAL_PUNCT` -* `HAS_CURRENCY_SYMBOL` -* `HAS_EMOJI` -* `HAS_MATH_SYMBOL` -* `HAS_MIXED_CASE` -* `HAS_NON_LETTER` -* `HAS_NO_DIGITS` -* `HAS_NO_PUNCT_OR_SYMBOL` -* `HAS_NO_QUOTES` -* `HAS_ONLY_DIGITS` -* `HAS_PUNCTUATION_DASH` -* `HAS_QUOTE` -* `HAS_SOME_DIGITS` -* `HAS_SOME_PUNCT_OR_SYMBOL` -* `HAS_TITLE_CASE` -* `IS_ACRONYM_WITH_PERIODS` -* `IS_EMOTICON` -* `IS_LOWERCASE` -* `IS_MIXED_CASE_LETTERS` -* `IS_NUMERIC_VALUE` -* `IS_PUNCT_OR_SYMBOL` -* `IS_UPPERCASE` -* `IS_WHITESPACE` diff --git a/docs/api_docs/python/text/WordShape_cls.md b/docs/api_docs/python/text/WordShape_cls.md new file mode 100644 index 000000000..85fe36088 --- /dev/null +++ b/docs/api_docs/python/text/WordShape_cls.md @@ -0,0 +1,444 @@ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +# text.WordShape + + + + + +
+View +source + +Values for the 'pattern' arg of the wordshape op. + + + +The supported wordshape identifiers are: + +* WordShape.BEGINS_WITH_OPEN_QUOTE: + The input begins with an open quote. + + The following strings are considered open quotes: + + ``` + " QUOTATION MARK + ' APOSTROPHE + ` GRAVE ACCENT + `` Pair of GRAVE ACCENTs + \uFF02 FULLWIDTH QUOTATION MARK + \uFF07 FULLWIDTH APOSTROPHE + \u00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + \u2018 LEFT SINGLE QUOTATION MARK + \u201A SINGLE LOW-9 QUOTATION MARK + \u201B SINGLE HIGH-REVERSED-9 QUOTATION MARK + \u201C LEFT DOUBLE QUOTATION MARK + \u201E DOUBLE LOW-9 QUOTATION MARK + \u201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK + \u2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK + \u300C LEFT CORNER BRACKET + \u300E LEFT WHITE CORNER BRACKET + \u301D REVERSED DOUBLE PRIME QUOTATION MARK + \u2E42 DOUBLE LOW-REVERSED-9 QUOTATION MARK + \uFF62 HALFWIDTH LEFT CORNER BRACKET + \uFE41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET + \uFE43 PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET + ``` + + Note: U+B4 (ACUTE ACCENT) is not included. + +* WordShape.BEGINS_WITH_PUNCT_OR_SYMBOL: + The input starts with a punctuation or symbol character. + +* WordShape.ENDS_WITH_CLOSE_QUOTE: + The input ends with a closing quote character.
+ + The following strings are considered close quotes: + + ``` + " QUOTATION MARK + ' APOSTROPHE + ` GRAVE ACCENT + '' Pair of APOSTROPHEs + \uFF02 FULLWIDTH QUOTATION MARK + \uFF07 FULLWIDTH APOSTROPHE + \u00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + \u2019 RIGHT SINGLE QUOTATION MARK + \u201D RIGHT DOUBLE QUOTATION MARK + \u203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + \u300D RIGHT CORNER BRACKET + \u300F RIGHT WHITE CORNER BRACKET + \u301E DOUBLE PRIME QUOTATION MARK + \u301F LOW DOUBLE PRIME QUOTATION MARK + \uFE42 PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET + \uFE44 PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET + \uFF63 HALFWIDTH RIGHT CORNER BRACKET + ``` + + Note: U+B4 (ACUTE ACCENT) is not included. + +* WordShape.ENDS_WITH_ELLIPSIS: + The input ends with an ellipsis (i.e. with three or more periods or a + Unicode ellipsis character). + +* WordShape.ENDS_WITH_EMOTICON: + The input ends with an emoticon. + +* WordShape.ENDS_WITH_MULTIPLE_SENTENCE_TERMINAL: + The input ends with multiple sentence-terminal characters. + +* WordShape.ENDS_WITH_MULTIPLE_TERMINAL_PUNCT: + The input ends with multiple terminal-punctuation characters. + +* WordShape.ENDS_WITH_PUNCT_OR_SYMBOL: + The input ends with a punctuation or symbol character. + +* WordShape.ENDS_WITH_SENTENCE_TERMINAL: + The input ends with a sentence-terminal character. + +* WordShape.ENDS_WITH_TERMINAL_PUNCT: + The input ends with a terminal-punctuation character. + +* WordShape.HAS_CURRENCY_SYMBOL: + The input contains a currency symbol. + +* WordShape.HAS_EMOJI: + The input contains an emoji character. + + See http://www.unicode.org/Public/emoji/1.0/emoji-data.txt. Emojis are in + Unicode ranges `2600-26FF`, `1F300-1F6FF`, and `1F900-1F9FF`. + +* WordShape.HAS_MATH_SYMBOL: + The input contains a mathematical symbol. + +* WordShape.HAS_MIXED_CASE: + The input contains both uppercase and lowercase letterforms.
+ +* WordShape.HAS_NON_LETTER: + The input contains a non-letter character. + +* WordShape.HAS_NO_DIGITS: + The input contains no digit characters. + +* WordShape.HAS_NO_PUNCT_OR_SYMBOL: + The input contains no Unicode punctuation or symbol characters. + +* WordShape.HAS_NO_QUOTES: + The input string contains no quote characters. + +* WordShape.HAS_ONLY_DIGITS: + The input consists entirely of Unicode digit characters. + +* WordShape.HAS_PUNCTUATION_DASH: + The input contains at least one Unicode dash character. + + Note that this uses the Pd (Dash) Unicode property. This property will not + match soft hyphens and katakana middle dot characters. + +* WordShape.HAS_QUOTE: + The input starts or ends with a Unicode quotation mark. + +* WordShape.HAS_SOME_DIGITS: + The input contains a mix of digit characters and non-digit characters. + +* WordShape.HAS_SOME_PUNCT_OR_SYMBOL: + The input contains a mix of punctuation or symbol characters, and + non-punctuation non-symbol characters. + +* WordShape.HAS_TITLE_CASE: + The input has title case (i.e. the first character is upper or title case, + and the remaining characters are lowercase). + +* WordShape.IS_ACRONYM_WITH_PERIODS: + The input is a period-separated acronym. This matches for strings of the + form "I.B.M." but not "IBM". + +* WordShape.IS_EMOTICON: + The input is a single emoticon. + +* WordShape.IS_LOWERCASE: + The input contains only lowercase letterforms. + +* WordShape.IS_MIXED_CASE_LETTERS: + The input contains only uppercase and lowercase letterforms. + +* WordShape.IS_NUMERIC_VALUE: + The input is parseable as a numeric value. This will match a fairly broad + set of floating point and integer representations (but not NaN or Inf). + +* WordShape.IS_PUNCT_OR_SYMBOL: + The input contains only punctuation and symbol characters. + +* WordShape.IS_UPPERCASE: + The input contains only uppercase letterforms. + +* WordShape.IS_WHITESPACE: + The input consists entirely of whitespace.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+BEGINS_WITH_OPEN_QUOTE + +text.WordShape +
+BEGINS_WITH_PUNCT_OR_SYMBOL + +text.WordShape +
+ENDS_WITH_CLOSE_QUOTE + +text.WordShape +
+ENDS_WITH_ELLIPSIS + +text.WordShape +
+ENDS_WITH_EMOTICON + +text.WordShape +
+ENDS_WITH_MULTIPLE_SENTENCE_TERMINAL + +text.WordShape +
+ENDS_WITH_MULTIPLE_TERMINAL_PUNCT + +text.WordShape +
+ENDS_WITH_PUNCT_OR_SYMBOL + +text.WordShape +
+ENDS_WITH_SENTENCE_TERMINAL + +text.WordShape +
+ENDS_WITH_TERMINAL_PUNCT + +text.WordShape +
+HAS_CURRENCY_SYMBOL + +text.WordShape +
+HAS_EMOJI + +text.WordShape +
+HAS_MATH_SYMBOL + +text.WordShape +
+HAS_MIXED_CASE + +text.WordShape +
+HAS_NON_LETTER + +text.WordShape +
+HAS_NO_DIGITS + +text.WordShape +
+HAS_NO_PUNCT_OR_SYMBOL + +text.WordShape +
+HAS_NO_QUOTES + +text.WordShape +
+HAS_ONLY_DIGITS + +text.WordShape +
+HAS_PUNCTUATION_DASH + +text.WordShape +
+HAS_QUOTE + +text.WordShape +
+HAS_SOME_DIGITS + +text.WordShape +
+HAS_SOME_PUNCT_OR_SYMBOL + +text.WordShape +
+HAS_TITLE_CASE + +text.WordShape +
+IS_ACRONYM_WITH_PERIODS + +text.WordShape +
+IS_EMOTICON + +text.WordShape +
+IS_LOWERCASE + +text.WordShape +
+IS_MIXED_CASE_LETTERS + +text.WordShape +
+IS_NUMERIC_VALUE + +text.WordShape +
+IS_PUNCT_OR_SYMBOL + +text.WordShape +
+IS_UPPERCASE + +text.WordShape +
+IS_WHITESPACE + +text.WordShape +
diff --git a/docs/api_docs/python/text/all_symbols.md b/docs/api_docs/python/text/all_symbols.md index e8e96dce2..69c531f08 100644 --- a/docs/api_docs/python/text/all_symbols.md +++ b/docs/api_docs/python/text/all_symbols.md @@ -24,7 +24,7 @@ * text.UnicodeScriptTokenizer * text.WaterfallTrimmer * text.WhitespaceTokenizer -* text.WordShape +* text.WordShape * text.WordpieceTokenizer * text.case_fold_utf8 * text.coerce_to_structurally_valid_utf8