# Viewing File: /home/ubuntu/.local/lib/python3.10/site-packages/tests/test_golden_rules.py

#!/usr/bin/env python3
"""
Test sentence segmentation against the "Golden Rules":
https://github.com/diasks2/pragmatic_segmenter#the-golden-rules
"""
import unittest

from gruut import sentences


class GoldenRulesTestCase(unittest.TestCase):
    """Check sentence segmentation against the pragmatic_segmenter Golden Rules.

    Each test passes one rule's sample text to ``_get_sentences`` and compares
    the returned sentence strings with the expected list.  Rules that appear
    only as comments (or as commented-out tests) are not exercised here.
    """

    def test_rule_1(self):
        """Simple period to end sentence"""
        text = "Hello World. My name is Jonas."
        expected = ["Hello World.", "My name is Jonas."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_2(self):
        """Question mark to end sentence"""
        text = "What is your name? My name is Jonas."
        expected = ["What is your name?", "My name is Jonas."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_3(self):
        """Exclamation point to end sentence"""
        text = "There it is! I found it."
        expected = ["There it is!", "I found it."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_4(self):
        """One letter upper case abbreviations"""
        text = "My name is Jonas E. Smith."
        # NOTE: gruut removes the "." from E
        expected = ["My name is Jonas E Smith."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_5(self):
        """One letter lower case abbreviations"""
        text = "Please turn to p. 55."
        # NOTE: gruut removes the "." from p
        expected = ["Please turn to p 55."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_6(self):
        """Two letter lower case abbreviations in the middle of a sentence"""
        text = "Were Jane and co. at the party?"
        # NOTE: gruut expands abbreviations ("co." -> "company")
        expected = ["Were Jane and company at the party?"]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_7(self):
        """Two letter upper case abbreviations in the middle of a sentence"""
        text = "They closed the deal with Pitt, Briggs & Co. at noon."
        # NOTE: gruut expands abbreviations ("& Co." -> "and Company")
        expected = ["They closed the deal with Pitt, Briggs and Company at noon."]
        self.assertEqual(_get_sentences(text), expected)

    # def test_rule_8(self):
    #     """Two letter lower case abbreviations at the end of a sentence"""
    #     self.assertEqual(
    #         _get_sentences("Let's ask Jane and co. They should know."),
    #         ["Let's ask Jane and company. They should know."],
    #     )

    # def test_rule_9(self):
    #     """Two letter upper case abbreviations at the end of a sentence"""
    #     self.assertEqual(
    #         _get_sentences("They closed the deal with Pitt, Briggs & Co. It closed yesterday."),
    #         ["They closed the deal with Pitt, Briggs and Company. It closed yesterday."],
    #     )

    def test_rule_10(self):
        """Two letter (prepositive) abbreviations"""
        text = "I can see Mt. Fuji from here."
        # NOTE: gruut expands abbreviations ("Mt." -> "Mount")
        expected = ["I can see Mount Fuji from here."]
        self.assertEqual(_get_sentences(text), expected)

    # def test_rule_11(self):
    #     """Two letter (prepositive & postpositive) abbreviations"""
    #     self.assertEqual(
    #         _get_sentences("St. Michael's Church is on 5th st. near the light."),
    #         ["Saint Michael's Church is on 5th street near the light."],
    #     )

    def test_rule_12(self):
        """Possessive two letter abbreviations"""
        text = "That is JFK Jr.'s book."
        # NOTE: gruut expands abbreviations; the expected text also spells
        # "JFK" as separate letters
        expected = ["That is J F K Junior's book."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_13(self):
        """Multi-period abbreviations in the middle of a sentence"""
        text = "I visited the U.S.A. last year."
        # NOTE: the expected text has the initialism as separate letters
        # without periods
        expected = ["I visited the U S A last year."]
        self.assertEqual(_get_sentences(text), expected)

    # 14) Multi-period abbreviations at the end of a sentence
    # I live in the E.U. How about you?
    # ["I live in the E.U.", "How about you?"]

    # 15) U.S. as sentence boundary
    # I live in the U.S. How about you?
    # ["I live in the U.S.", "How about you?"]

    # 16) U.S. as non sentence boundary with next word capitalized
    # I work for the U.S. Government in Virginia.
    # ["I work for the U.S. Government in Virginia."]

    def test_rule_17(self):
        """U.S. as non sentence boundary"""
        text = "I have lived in the U.S. for 20 years."
        # NOTE: the expected text has the initialism as separate letters
        # without periods
        expected = ["I have lived in the U S for 20 years."]
        self.assertEqual(_get_sentences(text), expected)

    # 18) A.M. / P.M. as non sentence boundary and sentence boundary
    # At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.
    # ["At 5 a.m. Mr. Smith went to the bank.", "He left the bank at 6 P.M.", "Mr. Smith then went to the store."]

    def test_rule_19(self):
        """Number as non sentence boundary"""
        text = "She has $100.00 in her bag."
        # The amount is expected to come back unchanged
        expected = ["She has $100.00 in her bag."]
        self.assertEqual(_get_sentences(text), expected)

    # 21) Parenthetical inside sentence
    # He teaches science (He previously worked for 5 years as an engineer.) at the local University.
    # ["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]

    # 22) Email addresses
    # Her email is Jane.Doe@example.com. I sent her an email.
    # ["Her email is Jane.Doe@example.com.", "I sent her an email."]

    # 23) Web addresses
    # The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.
    # ["The site is: https://www.example.50.com/new-site/awesome_content.html.", "Please check it out."]

    # 24) Single quotations inside sentence
    # She turned to him, 'This is great.' she said.
    # ["She turned to him, 'This is great.' she said."]

    # 25) Double quotations inside sentence
    # She turned to him, "This is great." she said.
    # ["She turned to him, \"This is great.\" she said."]

    # 26) Double quotations at the end of a sentence
    # She turned to him, "This is great." She held the book out to show him.
    # ["She turned to him, \"This is great.\"", "She held the book out to show him."]

    def test_rule_27(self):
        """Double punctuation (exclamation point)"""
        text = "Hello!! Long time no see."
        expected = ["Hello!!", "Long time no see."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_28(self):
        """Double punctuation (question mark)"""
        text = "Hello?? Who is there?"
        expected = ["Hello??", "Who is there?"]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_29(self):
        """Double punctuation (exclamation point / question mark)"""
        text = "Hello!? Is that you?"
        expected = ["Hello!?", "Is that you?"]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_30(self):
        """Double punctuation (question mark / exclamation point)"""
        text = "Hello?! Is that you?"
        expected = ["Hello?!", "Is that you?"]
        self.assertEqual(_get_sentences(text), expected)

    # 31) List (period followed by parens and no period to end item)
    # 1.) The first item 2.) The second item
    # ["1.) The first item", "2.) The second item"]

    # 32) List (period followed by parens and period to end item)
    # 1.) The first item. 2.) The second item.
    # ["1.) The first item.", "2.) The second item."]

    # 33) List (parens and no period to end item)
    # 1) The first item 2) The second item
    # ["1) The first item", "2) The second item"]

    # 34) List (parens and period to end item)
    # 1) The first item. 2) The second item.
    # ["1) The first item.", "2) The second item."]

    # 35) List (period to mark list and no period to end item)
    # 1. The first item 2. The second item
    # ["1. The first item", "2. The second item"]

    # 36) List (period to mark list and period to end item)
    # 1. The first item. 2. The second item.
    # ["1. The first item.", "2. The second item."]

    # 37) List with bullet
    # • 9. The first item • 10. The second item
    # ["• 9. The first item", "• 10. The second item"]

    # 38) List with hyphen
    # ⁃9. The first item ⁃10. The second item
    # ["⁃9. The first item", "⁃10. The second item"]

    # 39) Alphabetical list
    # a. The first item b. The second item c. The third list item
    # ["a. The first item", "b. The second item", "c. The third list item"]

    def test_rule_40(self):
        """Errant newlines in the middle of sentences (PDF)"""
        text = "This is a sentence\ncut off in the middle because pdf."
        # The embedded newline is expected to come back as a single space
        expected = ["This is a sentence cut off in the middle because pdf."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_41(self):
        """Errant newlines in the middle of sentences"""
        text = "It was a cold \nnight in the city."
        # The space + newline pair is expected to come back as a single space
        expected = ["It was a cold night in the city."]
        self.assertEqual(_get_sentences(text), expected)

    # 42) Lower case list separated by newline
    # features\ncontact manager\nevents, activities\n
    # ["features", "contact manager", "events, activities"]

    # 43) Geo Coordinates
    # You can find it at N°. 1026.253.553. That is where the treasure is.
    # ["You can find it at N°. 1026.253.553.", "That is where the treasure is."]

    # 44) Named entities with an exclamation point
    # She works at Yahoo! in the accounting department.
    # ["She works at Yahoo! in the accounting department."]

    # 45) I as a sentence boundary and I as an abbreviation
    # We make a good team, you and I. Did you see Albert I. Jones yesterday?
    # ["We make a good team, you and I.", "Did you see Albert I. Jones yesterday?"]

    # 46) Ellipsis at end of quotation
    # Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”
    # ["Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”"]

    # 47) Ellipsis with square brackets
    # "Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).
    # ["\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55)."]

    # 48) Ellipsis as sentence boundary (standard ellipsis rules)
    # If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.
    # ["If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", "Next sentence."]

    # 49) Ellipsis as sentence boundary (non-standard ellipsis rules)
    # I never meant that.... She left the store.
    # ["I never meant that....", "She left the store."]

    # def test_rule_49(self):
    #     """Ellipsis as sentence boundary (non-standard ellipsis rules)"""
    #     self.assertEqual(
    #         _get_sentences("I never meant that.... She left the store."),
    #         ["I never meant that....", "She left the store."],
    #     )

    # 50) Ellipsis as non sentence boundary
    # I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.
    # ["I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it."]

    # 51) 4-dot ellipsis
    # One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .
    # ["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . ."]


def _get_sentences(text):
    """Return the ``text`` attribute of every sentence gruut yields for *text*.

    Number and currency verbalization are switched off in the call to
    ``sentences``.
    """
    segmented = sentences(text, verbalize_numbers=False, verbalize_currency=False)
    return [sentence.text for sentence in segmented]


# -----------------------------------------------------------------------------

# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
# Back to Directory File Manager