# Viewing File: /home/ubuntu/.local/lib/python3.10/site-packages/tests/test_text_processor.py

#!/usr/bin/env python3
"""Tests for TextProcessor"""
import sys
import unittest

from gruut.text_processor import Sentence, TextProcessor, TextProcessorSettings, Word
from gruut.utils import print_graph

# Keyword arguments passed by most tests to processor.words() and
# processor.sentences(). They switch off the explicit_lang, phonemes and pos
# options; tests that use them build their expected Word objects without
# lang, phonemes or pos values.
WORDS_KWARGS = {"explicit_lang": False, "phonemes": False, "pos": False}


class TextProcessorTestCase(unittest.TestCase):
    """Tests for TextProcessor"""

    def test_whitespace(self):
        """Test that whitespace following each word is preserved"""
        text_processor = TextProcessor()
        graph, root = text_processor("This is  a   test    ")
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        # By default every word keeps the whitespace that followed it
        tokens = [("This", " "), ("is", "  "), ("a", "   "), ("test", "    ")]
        expected = [
            Word(idx=idx, sent_idx=0, text=text, text_with_ws=text + trailing)
            for idx, (text, trailing) in enumerate(tokens)
        ]
        self.assertEqual(actual, expected)

    def test_no_whitespace(self):
        """Test that whitespace is dropped when keep_whitespace is False"""
        text_processor = TextProcessor(keep_whitespace=False)
        graph, root = text_processor("This is  a   test    ")
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        # With whitespace discarded, text_with_ws equals the bare token
        expected = [
            Word(idx=idx, sent_idx=0, text=token, text_with_ws=token)
            for idx, token in enumerate(("This", "is", "a", "test"))
        ]
        self.assertEqual(actual, expected)

    def test_punctuation(self):
        """Test that surrounding punctuation is split off into its own words"""
        text_processor = TextProcessor(
            begin_punctuations={'"', "«"},
            end_punctuations={'"', "»"},
            minor_breaks={","},
            major_breaks={"."},
        )
        graph, root = text_processor('This «is»,  a "test".')
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        # One row per expected word: (text, trailing whitespace, extra flags)
        punct = {"is_punctuation": True}
        rows = [
            ("This", " ", {}),
            ("«", "", punct),
            ("is", "", {}),
            ("»", "", punct),
            (",", "  ", {"is_minor_break": True}),
            ("a", " ", {}),
            ('"', "", punct),
            ("test", "", {}),
            ('"', "", punct),
            (".", "", {"is_major_break": True}),
        ]
        expected = [
            Word(idx=idx, sent_idx=0, text=text, text_with_ws=text + ws, **flags)
            for idx, (text, ws, flags) in enumerate(rows)
        ]
        self.assertEqual(actual, expected)

    def test_punctuation_with_inner_break(self):
        """Test a major break that sits inside closing punctuation"""
        text_processor = TextProcessor(
            begin_punctuations={'"'},
            end_punctuations={'"'},
            major_breaks={"."},
        )
        graph, root = text_processor('Test "one." Test two.')
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        # Rows are (sent_idx, idx, text, trailing whitespace, extra flags).
        # The closing quote after "one." still belongs to the first sentence.
        punct = {"is_punctuation": True}
        major = {"is_major_break": True}
        rows = [
            # First sentence
            (0, 0, "Test", " ", {}),
            (0, 1, '"', "", punct),
            (0, 2, "one", "", {}),
            (0, 3, ".", "", major),
            (0, 4, '"', " ", punct),
            # Second sentence
            (1, 0, "Test", " ", {}),
            (1, 1, "two", "", {}),
            (1, 2, ".", "", major),
        ]
        expected = [
            Word(idx=idx, sent_idx=sent, text=text, text_with_ws=text + ws, **flags)
            for sent, idx, text, ws, flags in rows
        ]
        self.assertEqual(actual, expected)

    def test_replacements(self):
        """Test regex replacements applied during tokenization"""
        replacement_rules = [
            ("\\B'", '"'),  # single quote at a word edge becomes a double quote
            ("'\\B", '"'),
            ('[\\<\\>\\(\\)\\[\\]"]+', ""),  # brackets and double quotes are removed
        ]
        text_processor = TextProcessor(
            minor_breaks={","}, major_breaks={"."}, replacements=replacement_rules
        )
        graph, root = text_processor("\"This,\" [is] <a> (test) 'sentence.'")
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        # No quote or bracket survives; rows are (text, trailing ws, flags)
        rows = [
            ("This", "", {}),
            (",", " ", {"is_minor_break": True}),
            ("is", " ", {}),
            ("a", " ", {}),
            ("test", " ", {}),
            ("sentence", "", {}),
            (".", "", {"is_major_break": True}),
        ]
        expected = [
            Word(idx=idx, sent_idx=0, text=text, text_with_ws=text + ws, **flags)
            for idx, (text, ws, flags) in enumerate(rows)
        ]
        self.assertEqual(actual, expected)

    def test_abbreviations(self):
        """Test abbreviation expansion that keeps the original capitalization"""
        # Each pattern captures the first letter so its case can be re-used
        abbreviation_rules = {
            r"^([dD])r\.": r"\1octor",
            r"^([mM])r\.": r"\1ister",
            r"^([sS])t\.": r"\1treet",
        }
        text_processor = TextProcessor(
            minor_breaks={","},
            major_breaks={".", "?"},
            abbreviations=abbreviation_rules,
        )
        graph, root = text_processor("Mr.? I'm just a dr., on this St. at least.")
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        # Rows are (sent_idx, idx, text, trailing whitespace, extra flags)
        minor = {"is_minor_break": True}
        major = {"is_major_break": True}
        rows = [
            (0, 0, "Mister", "", {}),
            (0, 1, "?", " ", major),
            (1, 0, "I'm", " ", {}),
            (1, 1, "just", " ", {}),
            (1, 2, "a", " ", {}),
            (1, 3, "doctor", "", {}),
            (1, 4, ",", " ", minor),
            (1, 5, "on", " ", {}),
            (1, 6, "this", " ", {}),
            (1, 7, "Street", " ", {}),
            (1, 8, "at", " ", {}),
            (1, 9, "least", "", {}),
            (1, 10, ".", "", major),
        ]
        expected = [
            Word(idx=idx, sent_idx=sent, text=text, text_with_ws=text + ws, **flags)
            for sent, idx, text, ws, flags in rows
        ]
        self.assertEqual(actual, expected)

    def test_multiple_sentences(self):
        """Test splitting of text into sentences at major breaks"""
        text_processor = TextProcessor(major_breaks={".", "!"})
        graph, root = text_processor("First  sentence. Second sentence! ")

        # Expected words of each sentence, shared by both assertions below
        first_words = [
            Word(idx=0, sent_idx=0, text="First", text_with_ws="First  "),
            Word(idx=1, sent_idx=0, text="sentence", text_with_ws="sentence"),
            Word(idx=2, sent_idx=0, text=".", text_with_ws=". ", is_major_break=True),
        ]
        second_words = [
            Word(idx=0, sent_idx=1, text="Second", text_with_ws="Second "),
            Word(idx=1, sent_idx=1, text="sentence", text_with_ws="sentence"),
            Word(idx=2, sent_idx=1, text="!", text_with_ws="! ", is_major_break=True),
        ]

        # The flat word stream restarts idx after the major break
        actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))
        self.assertEqual(actual_words, first_words + second_words)

        # The sentence view groups the same words and adds sentence-level text
        actual_sentences = list(text_processor.sentences(graph, root, **WORDS_KWARGS))
        expected_sentences = [
            Sentence(
                idx=0,
                text="First sentence.",
                text_with_ws="First  sentence. ",
                text_spoken="First sentence",
                words=first_words,
            ),
            Sentence(
                idx=1,
                text="Second sentence!",
                text_with_ws="Second sentence! ",
                text_spoken="Second sentence",
                words=second_words,
            ),
        ]
        self.assertEqual(actual_sentences, expected_sentences)

    def test_multiple_paragraphs(self):
        """Test that words in separate SSML <p> elements get distinct par_idx"""
        text_processor = TextProcessor()
        ssml = "<speak><p>First paragraph</p><p>Second paragraph</p></speak>"
        graph, root = text_processor(ssml, ssml=True)
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        # Rows are (par_idx, idx, text, trailing whitespace); idx and sent_idx
        # both start over in the second paragraph
        rows = [
            (0, 0, "First", " "),
            (0, 1, "paragraph", ""),
            (1, 0, "Second", " "),
            (1, 1, "paragraph", ""),
        ]
        expected = [
            Word(idx=idx, sent_idx=0, par_idx=par, text=text, text_with_ws=text + ws)
            for par, idx, text, ws in rows
        ]
        self.assertEqual(actual, expected)

    def test_explicit_sentence(self):
        """Test that an SSML <s> element suppresses sentence splitting"""
        text_processor = TextProcessor(major_breaks={".", "!"})
        graph, root = text_processor(
            "<s>First sentence. Second sentence!</s>", ssml=True
        )
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        # All words stay in sentence 0 even though "." is a major break
        major = {"is_major_break": True}
        rows = [
            ("First", " ", {}),
            ("sentence", "", {}),
            (".", " ", major),
            ("Second", " ", {}),
            ("sentence", "", {}),
            ("!", "", major),
        ]
        expected = [
            Word(idx=idx, sent_idx=0, text=text, text_with_ws=text + ws, **flags)
            for idx, (text, ws, flags) in enumerate(rows)
        ]
        self.assertEqual(actual, expected)

    def test_minor_breaks(self):
        """Test that a minor (phrase) break is split from its word"""
        text_processor = TextProcessor(minor_breaks={","})
        graph, root = text_processor("this, is a test")
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        # The comma becomes its own word, flagged as a minor break
        rows = [
            ("this", "", {}),
            (",", " ", {"is_minor_break": True}),
            ("is", " ", {}),
            ("a", " ", {}),
            ("test", "", {}),
        ]
        expected = [
            Word(idx=idx, sent_idx=0, text=text, text_with_ws=text + ws, **flags)
            for idx, (text, ws, flags) in enumerate(rows)
        ]
        self.assertEqual(actual, expected)

    def test_word_breaks(self):
        """Test splitting of a word at an inner word-break character"""
        text_processor = TextProcessor(word_breaks={"-"})
        graph, root = text_processor("ninety-nine")
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        # The hyphen is gone; the first half is followed by a single space
        first_half = Word(idx=0, sent_idx=0, text="ninety", text_with_ws="ninety ")
        second_half = Word(idx=1, sent_idx=0, text="nine", text_with_ws="nine")
        self.assertEqual(actual, [first_half, second_half])

    def test_spell_out(self):
        """Test interpret-as="spell-out" in SSML"""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor(
            '<say-as interpret-as="spell-out">test123</say-as>', ssml=True
        )
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Letters are emitted one per word and digits are verbalized one by
        # one. The test writes nothing to stdout, so it stays quiet when run.
        spelled = ["t", "e", "s", "t", "one", "two", "three"]
        last_idx = len(spelled) - 1
        expected = [
            Word(
                idx=idx,
                sent_idx=0,
                text=text,
                # every word but the last is followed by a single space
                text_with_ws=text if idx == last_idx else text + " ",
            )
            for idx, text in enumerate(spelled)
        ]
        self.assertEqual(words, expected)

    def test_initialisms(self):
        """Test that an initialism is spelled out letter by letter"""
        text_processor = TextProcessor(
            major_breaks={"."},
            # All-uppercase alphabetic tokens count as initialisms here
            is_initialism=lambda s: s.isalpha() and s.isupper(),
            split_initialism=list,
        )
        graph, root = text_processor("TTS.")
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        # Each letter is its own word; the final period stays a major break
        rows = [
            ("T", " ", {}),
            ("T", " ", {}),
            ("S", "", {}),
            (".", "", {"is_major_break": True}),
        ]
        expected = [
            Word(idx=idx, sent_idx=0, text=text, text_with_ws=text + ws, **flags)
            for idx, (text, ws, flags) in enumerate(rows)
        ]
        self.assertEqual(actual, expected)

    def test_numbers_one_language(self):
        """Test verbalization of numbers in the default language"""
        text_processor = TextProcessor(default_lang="en_US")
        graph, root = text_processor("1 2 3")
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        # Digits come back as English number words
        spoken = [("one", " "), ("two", " "), ("three", "")]
        expected = [
            Word(idx=idx, sent_idx=0, text=text, text_with_ws=text + ws)
            for idx, (text, ws) in enumerate(spoken)
        ]
        self.assertEqual(actual, expected)

    def test_numbers_multiple_languages(self):
        """Test verbalization of numbers tagged with different SSML languages"""
        text_processor = TextProcessor(default_lang="en_US")
        ssml = '1 <w lang="es_ES">2</w> <w lang="de_DE">3</w>'
        graph, root = text_processor(ssml, ssml=True)
        actual = list(text_processor.words(graph, root, phonemes=False))

        # Each number is spoken in the language of its enclosing <w> element;
        # rows are (lang, text, trailing whitespace)
        rows = [
            ("en_US", "one", " "),
            ("es_ES", "dos", " "),
            ("de_DE", "drei", ""),
        ]
        expected = [
            Word(lang=lang, idx=idx, sent_idx=0, text=text, text_with_ws=text + ws)
            for idx, (lang, text, ws) in enumerate(rows)
        ]
        self.assertEqual(actual, expected)

    def test_currency_one_language(self):
        """Test verbalization of a currency amount in the default language"""
        text_processor = TextProcessor(default_lang="en_US")
        graph, root = text_processor("$10")
        actual = list(text_processor.words(graph, root, phonemes=False, pos=False))

        # "$10" is spoken as the amount followed by the currency name
        amount = Word(
            lang="en_US", idx=0, sent_idx=0, text="ten", text_with_ws="ten "
        )
        unit = Word(
            lang="en_US", idx=1, sent_idx=0, text="dollars", text_with_ws="dollars"
        )
        self.assertEqual(actual, [amount, unit])

    def test_currency_multiple_language(self):
        """Test verbalization of currency tagged with different SSML languages"""
        text_processor = TextProcessor(default_lang="en_US")
        ssml = '€10 <w lang="fr_FR">€10</w> <w lang="nl_NL">€10</w>'
        graph, root = text_processor(ssml, ssml=True, phonemize=False)
        actual = list(text_processor.words(graph, root, phonemes=False, pos=False))

        # The same amount is spoken once per language;
        # rows are (lang, text, trailing whitespace)
        rows = [
            ("en_US", "ten", " "),
            ("en_US", "euro", " "),
            ("fr_FR", "dix", " "),
            ("fr_FR", "euros", " "),
            ("nl_NL", "tien", " "),
            ("nl_NL", "euro", ""),
        ]
        expected = [
            Word(lang=lang, idx=idx, sent_idx=0, text=text, text_with_ws=text + ws)
            for idx, (lang, text, ws) in enumerate(rows)
        ]
        self.assertEqual(actual, expected)

    def test_currency_default(self):
        """Test that default_currency is used when the text has no currency symbol"""
        text_processor = TextProcessor(default_lang="en_US", default_currency="USD")
        ssml = '<say-as interpret-as="currency">10</say-as>'
        graph, root = text_processor(ssml, ssml=True)
        actual = list(text_processor.words(graph, root, phonemes=False, pos=False))

        # The bare number is still spoken as dollars, with no "$" in the input
        amount = Word(
            lang="en_US", idx=0, sent_idx=0, text="ten", text_with_ws="ten "
        )
        unit = Word(
            lang="en_US", idx=1, sent_idx=0, text="dollars", text_with_ws="dollars"
        )
        self.assertEqual(actual, [amount, unit])

    def test_time(self):
        """Test verbalization of a clock time with minutes (English)"""
        text_processor = TextProcessor(default_lang="en_US")
        graph, root = text_processor("  4:01pm")
        actual = list(text_processor.words(graph, root, phonemes=False, pos=False))

        # Pairs are (text, text_with_ws); the leading spaces of the input end
        # up in front of the first word
        pairs = [
            ("four", "  four "),
            ("oh", "oh "),
            ("one", "one "),
            ("P", "P "),
            ("M", "M"),
        ]
        expected = [
            Word(lang="en_US", idx=idx, sent_idx=0, text=text, text_with_ws=with_ws)
            for idx, (text, with_ws) in enumerate(pairs)
        ]
        self.assertEqual(actual, expected)

    def test_time_no_colon(self):
        """Test verbalization of a clock time written without a colon (English)"""
        text_processor = TextProcessor(default_lang="en_US")
        graph, root = text_processor("10am")
        actual = list(text_processor.words(graph, root, phonemes=False, pos=False))

        # The hour is spoken, then "am" is spelled as two letters
        spoken = [("ten", " "), ("A", " "), ("M", "")]
        expected = [
            Word(lang="en_US", idx=idx, sent_idx=0, text=text, text_with_ws=text + ws)
            for idx, (text, ws) in enumerate(spoken)
        ]
        self.assertEqual(actual, expected)

    def test_date_one_language(self):
        """Test verbalization of a date in the default language"""
        text_processor = TextProcessor(default_lang="en_US", word_breaks={"-"})
        graph, root = text_processor("4/1/1999")
        actual = list(text_processor.words(graph, root, phonemes=False, pos=False))

        # Month, ordinal day, a comma as minor break, then the year in words;
        # rows are (text, trailing whitespace, extra flags)
        rows = [
            ("April", " ", {}),
            ("first", "", {}),
            (",", " ", {"is_minor_break": True}),
            ("nineteen", " ", {}),
            ("ninety", " ", {}),
            ("nine", "", {}),
        ]
        expected = [
            Word(
                lang="en_US",
                idx=idx,
                sent_idx=0,
                text=text,
                text_with_ws=text + ws,
                **flags,
            )
            for idx, (text, ws, flags) in enumerate(rows)
        ]
        self.assertEqual(actual, expected)

    def test_date_multiple_languages(self):
        """Test verbalization of dates in SSML sentences of different languages"""

        def sentence_words(lang, sent_idx, rows):
            """Build one sentence's expected words from (text, ws, flags) rows"""
            return [
                Word(
                    lang=lang,
                    idx=idx,
                    sent_idx=sent_idx,
                    text=text,
                    text_with_ws=text + ws,
                    **flags,
                )
                for idx, (text, ws, flags) in enumerate(rows)
            ]

        text_processor = TextProcessor(default_lang="en_US", word_breaks={"-"})
        graph, root = text_processor(
            '<speak><s>4/1/1999</s> <s lang="fr_FR">4/1/1999</s><s lang="de_DE">01.04.1999</s></speak>',
            ssml=True,
            phonemize=False,  # ensure French year is split
        )
        actual = list(text_processor.words(graph, root, phonemes=False, pos=False))

        english = sentence_words(
            "en_US",
            0,
            [
                ("April", " ", {}),
                ("first", "", {}),
                (",", " ", {"is_minor_break": True}),
                ("nineteen", " ", {}),
                ("ninety", " ", {}),
                ("nine", "", {}),
            ],
        )
        french = sentence_words(
            "fr_FR",
            1,
            [
                ("quatrième", " ", {}),
                ("janvier", " ", {}),
                ("mille", " ", {}),
                ("neuf", " ", {}),
                ("cent", " ", {}),
                ("quatre", " ", {}),
                ("vingt", " ", {}),
                ("dix", " ", {}),
                ("neuf", "", {}),
            ],
        )
        german = sentence_words(
            "de_DE",
            2,
            [
                ("erste", " ", {}),
                ("April", " ", {}),
                ("neunzehnhundertneunundneunzig", "", {}),
            ],
        )

        # Each <s> element yields its own sentence, spoken in its own language
        self.assertEqual(actual, english + french + german)

    def test_date_format_ordinal(self):
        """Test date format in SSML (format="md")"""
        text_processor = TextProcessor(default_lang="en_US")
        ssml = '<say-as interpret-as="date" format="md">4/1</say-as>'
        graph, root = text_processor(ssml, ssml=True)
        actual = list(text_processor.words(graph, root, phonemes=False, pos=False))

        # The text is forced to be read as a date: month first, then day.
        # NOTE(review): the test name says "ordinal", yet the expected day is
        # the cardinal "one" rather than "first" - confirm which is intended.
        month = Word(
            lang="en_US", idx=0, sent_idx=0, text="April", text_with_ws="April "
        )
        day = Word(lang="en_US", idx=1, sent_idx=0, text="one", text_with_ws="one")
        self.assertEqual(actual, [month, day])

    def test_date_format_cardinal(self):
        """Test date format in SSML (cardinal day, format="dmy")"""
        text_processor = TextProcessor(default_lang="en_US")
        ssml = '<say-as interpret-as="date" format="dmy">4/1/2000</say-as>'
        graph, root = text_processor(ssml, ssml=True)
        actual = list(text_processor.words(graph, root, phonemes=False, pos=False))

        # Day comes first as a cardinal ("one"), then the month, then the year
        spoken = [("one", " "), ("April", " "), ("two", " "), ("thousand", "")]
        expected = [
            Word(lang="en_US", idx=idx, sent_idx=0, text=text, text_with_ws=text + ws)
            for idx, (text, ws) in enumerate(spoken)
        ]
        self.assertEqual(actual, expected)

    def test_part_of_speech_tagging(self):
        """Test that tags from a custom part-of-speech tagger reach the words"""

        def get_parts_of_speech(words, *args, **kwargs):
            # Fake tagger: each word's "tag" is the word itself in upper case
            return [token.upper() for token in words]

        text_processor = TextProcessor(get_parts_of_speech=get_parts_of_speech)
        graph, root = text_processor("a test")
        actual = list(
            text_processor.words(graph, root, explicit_lang=False, phonemes=False)
        )

        # The fake tags appear in the pos field of each word
        expected = [
            Word(idx=0, sent_idx=0, text="a", text_with_ws="a ", pos="A"),
            Word(idx=1, sent_idx=0, text="test", text_with_ws="test", pos="TEST"),
        ]
        self.assertEqual(actual, expected)

    def test_phonemize_one_language(self):
        """Test that a custom phoneme lookup is applied (single language)"""

        def lookup_phonemes(word: str, *args, **kwargs):
            # Fake phonemizer: the "phonemes" are simply the word's letters
            return list(word)

        text_processor = TextProcessor(lookup_phonemes=lookup_phonemes)
        graph, root = text_processor("test")
        actual = list(
            text_processor.words(graph, root, pos=False, explicit_lang=False)
        )

        # The only word carries the letters produced by the fake lookup
        only_word = Word(
            idx=0,
            sent_idx=0,
            text="test",
            text_with_ws="test",
            phonemes=["t", "e", "s", "t"],
        )
        self.assertEqual(actual, [only_word])

    def test_phonemize_one_language_multiple_roles(self):
        """Test that the SSML word role is passed to the phoneme lookup"""

        def lookup_phonemes(word, role=None, **kwargs):
            # Fake phonemizer: upper-case letters when a role is given,
            # otherwise the letters unchanged
            if role:
                return list(word.upper())
            return list(word)

        text_processor = TextProcessor(lookup_phonemes=lookup_phonemes)

        # The second "test" carries a made-up role
        ssml = '<speak>test <w role="some_role">test</w></speak>'
        graph, root = text_processor(ssml, ssml=True, pos=False)
        actual = list(
            text_processor.words(graph, root, pos=False, explicit_lang=False)
        )

        # Same spelling, different phonemes depending on the role
        without_role = Word(
            idx=0,
            sent_idx=0,
            text="test",
            text_with_ws="test ",
            phonemes=["t", "e", "s", "t"],
        )
        with_role = Word(
            idx=1,
            sent_idx=0,
            text="test",
            text_with_ws="test",
            phonemes=["T", "E", "S", "T"],
        )
        self.assertEqual(actual, [without_role, with_role])

    def test_phonemize_multiple_languages(self):
        """Test that each language uses its own phoneme lookup (SSML)"""

        def en_lookup_phonemes(word: str, *args, **kwargs):
            # Fake English lookup: letters unchanged
            return list(word)

        def de_lookup_phonemes(word: str, *args, **kwargs):
            # Fake German lookup: letters in upper case
            return list(word.upper())

        german_settings = TextProcessorSettings(
            lang="de_DE", lookup_phonemes=de_lookup_phonemes
        )
        text_processor = TextProcessor(
            default_lang="en_US",
            lookup_phonemes=en_lookup_phonemes,
            settings={"de_DE": german_settings},
        )
        ssml = '<speak>test <w lang="de_DE">test</w></speak>'
        graph, root = text_processor(ssml, ssml=True)
        actual = list(text_processor.words(graph, root))

        # Same spelling, phonemized by the lookup of each word's language
        english_word = Word(
            lang="en_US",
            idx=0,
            sent_idx=0,
            text="test",
            text_with_ws="test ",
            phonemes=["t", "e", "s", "t"],
        )
        german_word = Word(
            lang="de_DE",
            idx=1,
            sent_idx=0,
            text="test",
            text_with_ws="test",
            phonemes=["T", "E", "S", "T"],
        )
        self.assertEqual(actual, [english_word, german_word])

    def test_sub(self):
        """Test SSML substitution

        The ``alias`` text of a ``<sub>`` element is expected to stand in for
        the enclosed text, so ``W3C`` comes out as four separate words.
        """
        ssml_text = '<speak><sub alias="World Wide Web Consortium">W3C</sub></speak>'

        text_processor = TextProcessor(default_lang="en_US")
        graph, root = text_processor(ssml_text, ssml=True)
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        # (text, text_with_ws) for each word of the alias, in order
        alias_words = [
            ("World", "World "),
            ("Wide", "Wide "),
            ("Web", "Web "),
            ("Consortium", "Consortium"),
        ]
        expected = [
            Word(idx=position, sent_idx=0, text=text, text_with_ws=text_with_ws)
            for position, (text, text_with_ws) in enumerate(alias_words)
        ]

        # Single word is replaced by multiple words
        self.assertEqual(actual, expected)

    def test_break(self):
        """Test SSML break tag

        ``<break>`` elements are placed at every nesting level (speak,
        paragraph, sentence, and between words). The expected result has their
        times summed into the ``pause_before_ms`` / ``pause_after_ms`` fields
        of the single sentence and of its first word.
        """
        ssml_text = """
        <speak>
          <break time="1s"/>
          <p>
            <break time="2s" />
            <s>
              <break time="3s" />
              Break <break time="4s" /> here
            </s>
            <break time="5s" />
          </p>
          <break time="6s" />
        </speak>
        """

        text_processor = TextProcessor(default_lang="en_US", keep_whitespace=False)
        graph, root = text_processor(ssml_text, ssml=True)
        actual = list(text_processor.sentences(graph, root, **WORDS_KWARGS))

        ms_per_second = 1000

        # Breaks inside <s>: 3s precedes "Break", 4s follows it
        first_word = Word(
            idx=0,
            sent_idx=0,
            text="Break",
            text_with_ws="Break",
            pause_before_ms=3 * ms_per_second,
            pause_after_ms=4 * ms_per_second,
        )
        second_word = Word(idx=1, sent_idx=0, text="here", text_with_ws="here")

        expected_sentence = Sentence(
            idx=0,
            text="Break here",
            text_with_ws="Break here",
            text_spoken="Break here",
            # 1s (before <p>) plus 2s (before <s>)
            pause_before_ms=(1 + 2) * ms_per_second,
            # 5s (after </s>) plus 6s (after </p>)
            pause_after_ms=(5 + 6) * ms_per_second,
            words=[first_word, second_word],
        )

        # Break times are attached to appropriate elements
        self.assertEqual(actual, [expected_sentence])

    def test_mark(self):
        """Test SSML mark tag

        ``<mark>`` elements are placed at every nesting level (speak,
        paragraph, sentence, and between words). The expected result has their
        names collected into the ``marks_before`` / ``marks_after`` lists of
        the single sentence and of its first word.
        """
        ssml_text = """
        <speak>
          <mark name="a"/>
          <p>
            <mark name="b" />
            <s>
              <mark name="c" />
              Mark <mark name="d" /> here
            </s>
            <mark name="e" />
          </p>
          <mark name="f" />
        </speak>
        """

        text_processor = TextProcessor(default_lang="en_US", keep_whitespace=False)
        graph, root = text_processor(ssml_text, ssml=True)
        actual = list(text_processor.sentences(graph, root, **WORDS_KWARGS))

        # Marks inside <s>: "c" precedes "Mark", "d" follows it
        first_word = Word(
            idx=0,
            sent_idx=0,
            text="Mark",
            text_with_ws="Mark",
            marks_before=["c"],
            marks_after=["d"],
        )
        second_word = Word(idx=1, sent_idx=0, text="here", text_with_ws="here")

        expected_sentence = Sentence(
            idx=0,
            text="Mark here",
            text_with_ws="Mark here",
            text_spoken="Mark here",
            # "a" (before <p>) and "b" (before <s>)
            marks_before=["a", "b"],
            # "e" (after </s>) and "f" (after </p>)
            marks_after=["e", "f"],
            words=[first_word, second_word],
        )

        # Mark names are attached to appropriate elements
        self.assertEqual(actual, [expected_sentence])

    def test_missing_speak(self):
        """Test SSML with missing <speak> tag

        The input consists of two sibling ``<s>`` elements with no enclosing
        root element; each is expected to yield one word in its own sentence.
        """
        ssml_text = "<s>hello</s><s>world</s>"

        text_processor = TextProcessor()
        graph, root = text_processor(ssml_text, ssml=True)
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        expected = [
            Word(idx=0, sent_idx=0, text="hello", text_with_ws="hello"),
            Word(idx=0, sent_idx=1, text="world", text_with_ws="world"),
        ]

        # <speak> is automatically added when XML fails to parse
        self.assertEqual(actual, expected)

    def test_adjacent_voice(self):
        """Test SSML with adjacent <voice> tags

        Two sibling ``<voice>`` elements are given without an enclosing root.
        Each word is expected to carry the name of its voice and to fall in a
        separate sentence. ``major_breaks=False`` is passed, and the expected
        words do not include the trailing periods.
        """
        ssml_text = '<voice name="a">hello.</voice><voice name="b">world.</voice>'

        text_processor = TextProcessor()
        graph, root = text_processor(ssml_text, ssml=True)
        actual = list(
            text_processor.words(graph, root, major_breaks=False, **WORDS_KWARGS)
        )

        expected = [
            Word(idx=0, sent_idx=0, voice="a", text="hello", text_with_ws="hello"),
            Word(idx=0, sent_idx=1, voice="b", text="world", text_with_ws="world"),
        ]
        self.assertEqual(actual, expected)

    def test_multiple_passes(self):
        """Test sentence that needs multiple passes to fully resolve

        ``ABCD-10`` is expected to end up as the four letters spelled out
        individually followed by the word ``ten``.
        """
        text_processor = TextProcessor()
        graph, root = text_processor("ABCD-10")
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        # 1) ABCD-10 -> ABCD 10
        # 2) ABCD 10 -> A B C D ten
        # Each entry is (text, text_with_ws), in output order
        resolved = [
            ("A", "A "),
            ("B", "B "),
            ("C", "C "),
            ("D", "D "),
            ("ten", "ten"),
        ]
        expected = [
            Word(idx=position, text=text, text_with_ws=text_with_ws)
            for position, (text, text_with_ws) in enumerate(resolved)
        ]
        self.assertEqual(actual, expected)

    def test_number_nonfinite(self):
        """Test sentence with nan or inf

        The tokens ``nan`` and ``inf`` are expected to come through as
        ordinary words with their text unchanged.
        """
        text_processor = TextProcessor()
        graph, root = text_processor("nan inf")
        actual = list(text_processor.words(graph, root, **WORDS_KWARGS))

        expected = [
            Word(idx=0, text="nan", text_with_ws="nan "),
            Word(idx=1, text="inf", text_with_ws="inf"),
        ]

        # Words should not be parsed as numbers
        self.assertEqual(actual, expected)

    def test_override_initialism(self):
        """Test use of inline lexicon pronunciation to override an initialism

        Plain ``ROOFUS`` is expected to be spelled out letter by letter. With
        an inline ``<lexicon>`` entry for the same grapheme, it is expected to
        remain a single word.
        """
        text_processor = TextProcessor()

        # --- Without a lexicon entry ---
        graph, root = text_processor("ROOFUS")
        spelled_out = list(text_processor.words(graph, root, **WORDS_KWARGS))

        letters = "ROOFUS"
        final_position = len(letters) - 1
        expected_letters = [
            Word(
                idx=position,
                text=letter,
                # Every letter except the last is followed by a single space
                text_with_ws=letter if position == final_position else letter + " ",
            )
            for position, letter in enumerate(letters)
        ]

        # Word is interpreted as initialism
        self.assertEqual(spelled_out, expected_letters)

        # --- With an inline lexicon entry ---
        ssml_text = """
        <speak>
          <lexicon>
            <lexeme>
              <grapheme>ROOFUS</grapheme>
              <phoneme>ɹ ˈu f ə s</phoneme>
            </lexeme>
          </lexicon>
          <s>ROOFUS</s>
        </speak>"""
        graph, root = text_processor(ssml_text, ssml=True)
        whole_word = list(text_processor.words(graph, root, **WORDS_KWARGS))

        # Word is *not* interpreted as initialism
        self.assertEqual(
            whole_word, [Word(idx=0, text="ROOFUS", text_with_ws="ROOFUS")]
        )


def print_graph_stderr(graph, root):
    """Print graph to stderr

    Delegates to ``print_graph``, supplying a print function that forwards
    its positional arguments to ``print`` with ``file=sys.stderr``.
    """

    def print_to_stderr(*parts):
        print(*parts, file=sys.stderr)

    print_graph(graph, root, print_func=print_to_stderr)


# -----------------------------------------------------------------------------

# Run the tests via unittest's command-line entry point when this file is
# executed directly as a script (not when it is imported).
if __name__ == "__main__":
    unittest.main()
# NOTE(review): stray file-viewer footer ("Back to Directory File Manager") commented out; it is not Python.