From 478a77a24317206e048eed3d1b55ae005eefdbda Mon Sep 17 00:00:00 2001 From: Anirban Bhowmick Date: Mon, 11 Aug 2025 11:24:07 +0200 Subject: [PATCH 1/6] added German language support for Flesch and new German readability metric wiener-sachtextformel --- .gitignore | 3 +- readability/readability.py | 12 ++- readability/scorers/__init__.py | 7 +- readability/scorers/flesch.py | 105 +++++++++++++------ readability/scorers/wiener_sachtextformel.py | 54 ++++++++++ readability/text/analyzer.py | 14 +++ test/test_readability.py | 23 ++++ 7 files changed, 179 insertions(+), 39 deletions(-) create mode 100644 readability/scorers/wiener_sachtextformel.py diff --git a/.gitignore b/.gitignore index c65a17d..b8da8a2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ __pycache__ .vscode py_readability_metrics.egg-info dist -build \ No newline at end of file +build +venv \ No newline at end of file diff --git a/readability/readability.py b/readability/readability.py index 3a48d42..ea5d040 100644 --- a/readability/readability.py +++ b/readability/readability.py @@ -1,13 +1,16 @@ from .text import Analyzer from .scorers import ARI, ColemanLiau, DaleChall, Flesch, \ - FleschKincaid, GunningFog, LinsearWrite, Smog, Spache + FleschKincaid, GunningFog, LinsearWrite, Smog, Spache, WienerSachtextformel import warnings +import nltk +nltk.download('punkt_tab') class Readability: - def __init__(self, text, min_words=100): + def __init__(self, text, min_words=100, language='en'): self._analyzer = Analyzer() self._statistics = self._analyzer.analyze(text) self._min_words = min_words + self._language = language if self._min_words < 100: warnings.warn( "Documents with fewer than 100 words may affect the accuracy of readability tests" @@ -27,7 +30,7 @@ def dale_chall(self): def flesch(self): """Calculate Flesch Reading Ease score.""" - return Flesch(self._statistics, self._min_words).score() + return Flesch(self._statistics, self._min_words, self._language).score() def 
flesch_kincaid(self): """Calculate Flesch-Kincaid Grade Level.""" @@ -46,6 +49,9 @@ def smog(self,all_sentences=False, ignore_length=False): `all_sentences` indicates whether SMOG should use a sample of 30 sentences, as described in the original paper, or if it should use all sentences in the text""" return Smog(self._statistics, self._analyzer.sentences, all_sentences=all_sentences, ignore_length=ignore_length).score() + def wiener_sachtextformel(self): + """Wiener Sachtextformel.""" + return WienerSachtextformel(self._statistics, self._min_words).score() def spache(self): """Spache Index.""" diff --git a/readability/scorers/__init__.py b/readability/scorers/__init__.py index df708e8..f4a413c 100644 --- a/readability/scorers/__init__.py +++ b/readability/scorers/__init__.py @@ -1,10 +1,11 @@ +from .ari import ARI +from .coleman_liau import ColemanLiau +from .dale_chall import DaleChall from .flesch import Flesch from .flesch_kincaid import FleschKincaid from .gunning_fog import GunningFog -from .coleman_liau import ColemanLiau -from .dale_chall import DaleChall -from .ari import ARI from .linsear_write import LinsearWrite from .smog import Smog from .spache import Spache +from .wiener_sachtextformel import WienerSachtextformel diff --git a/readability/scorers/flesch.py b/readability/scorers/flesch.py index 7d35cc0..c8dbd99 100644 --- a/readability/scorers/flesch.py +++ b/readability/scorers/flesch.py @@ -13,8 +13,9 @@ def __str__(self): class Flesch: - def __init__(self, stats, min_words=100): + def __init__(self, stats, min_words=100, language='en'): self._stats = stats + self._language = language if stats.num_words < min_words: raise ReadabilityException('{} words required.'.format(min_words)) @@ -27,38 +28,78 @@ def score(self): def _score(self): stats = self._stats - words_per_sent = stats.num_words / stats.num_sentences - syllables_per_word = stats.num_syllables / stats.num_words - return 206.835 - (1.015 * words_per_sent) - (84.6 * syllables_per_word) + if 
self._language == 'en': + words_per_sent = stats.num_words / stats.num_sentences + syllables_per_word = stats.num_syllables / stats.num_words + return 206.835 - (1.015 * words_per_sent) - (84.6 * syllables_per_word) + elif self._language == 'de': + words_per_sent = stats.num_words / stats.num_sentences + syllables_per_word = stats.num_syllables / stats.num_words + return 180 - words_per_sent - (58.5 * syllables_per_word) + else: + raise ReadabilityException('Unsupported language: {}'.format(self._language)) + def _ease(self, score): - if score >= 90 and score <= 100: - return 'very_easy' - elif score >= 80 and score < 90: - return 'easy' - elif score >= 70 and score < 80: - return 'fairly_easy' - elif score >= 60 and score < 70: - return 'standard' - elif score >= 50 and score < 60: - return 'fairly_difficult' - elif score >= 30 and score < 50: - return 'difficult' - else: - return 'very_confusing' + if self._language == 'en': + if score >= 90 and score <= 100: + return 'very_easy' + elif score >= 80 and score < 90: + return 'easy' + elif score >= 70 and score < 80: + return 'fairly_easy' + elif score >= 60 and score < 70: + return 'standard' + elif score >= 50 and score < 60: + return 'fairly_difficult' + elif score >= 30 and score < 50: + return 'difficult' + else: + return 'very_confusing' + elif self._language == 'de': + if score >= 90 and score <= 100: + return 'sehr_leicht' + elif score >= 80 and score < 90: + return 'leicht' + elif score >= 70 and score < 80: + return 'mittel_leicht' + elif score >= 60 and score < 70: + return 'mittel' + elif score >= 50 and score < 60: + return 'mittel_schwer' + elif score >= 30 and score < 50: + return 'schwer' + else: + return 'sehr_schwer' def _grade_levels(self, score): - if score >= 90 and score <= 100: - return ['5'] - elif score >= 80 and score < 90: - return ['6'] - elif score >= 70 and score < 80: - return ['7'] - elif score >= 60 and score < 70: - return ['8', '9'] - elif score >= 50 and score < 60: - return 
['10', '11', '12'] - elif score >= 30 and score < 50: - return ['college'] - else: - return ['college_graduate'] + if self._language == 'en': + if score >= 90 and score <= 100: + return ['5'] + elif score >= 80 and score < 90: + return ['6'] + elif score >= 70 and score < 80: + return ['7'] + elif score >= 60 and score < 70: + return ['8', '9'] + elif score >= 50 and score < 60: + return ['10', '11', '12'] + elif score >= 30 and score < 50: + return ['college'] + else: + return ['college_graduate'] + elif self._language == 'de': + if score >= 90 and score <= 100: + return ['11-jährige Schülerinnen und Schüler'] + elif score >= 80 and score < 90: + return ['11-12 jährige Schülerinnen und Schüler'] + elif score >= 70 and score < 80: + return ['11-12 jährige Schülerinnen und Schüler'] + elif score >= 60 and score < 70: + return ['13-15 jährige Schülerinnen und Schüler'] + elif score >= 50 and score < 60: + return ['13-15 jährige Schülerinnen und Schüler'] + elif score >= 30 and score < 50: + return ['13-15 jährige Schülerinnen und Schüler'] + else: + return ['Akademikerinnen und Akademiker'] diff --git a/readability/scorers/wiener_sachtextformel.py b/readability/scorers/wiener_sachtextformel.py new file mode 100644 index 0000000..b01e702 --- /dev/null +++ b/readability/scorers/wiener_sachtextformel.py @@ -0,0 +1,54 @@ +from readability.exceptions import ReadabilityException + + +class Result: + def __init__(self, score, grade_level): + self.score = score + self.grade_level = grade_level + + def __str__(self): + return "score: {}, grade_level: '{}'". 
\ + format(self.score, self.grade_level) + + +class WienerSachtextformel: + def __init__(self, stats, min_words=100, language='de'): + self._stats = stats + if stats.num_words < min_words: + raise ReadabilityException('{} words required.'.format(min_words)) + + def score(self): + score = self._score() + return Result( + score=score, + grade_level=self._grade_level(score) + ) + + def _score(self): + stats = self._stats + return (0.1935 * (stats.num_poly_syllable_words / stats.num_words)*100) + (0.1672 * stats.num_words) + \ + (0.1297 * (stats.num_six_letter_words / stats.num_words)*100) - (0.0327 * (stats.num_mono_syllable_words / stats.num_words)*100) - 15.59 + + def _ease(self, score): + if score >= 4 and score <= 5: + return 'very_easy' + elif score >=6 and score <=7: + return 'easy' + elif score >=8 and score <=10: + return 'average' + elif score >=11 and score <=12: + return 'difficult' + else: + return 'very_difficult' + + def _grade_level(self, score): + if score >= 4 and score <= 5: + return ['4th-5th grade'] + elif score >=6 and score <=7: + return ['6th-7th grade'] + elif score >=8 and score <=10: + return ['8th-10th grade'] + elif score >=11 and score <=12: + return ['11th-12th grade'] + else: + return ['college level and above'] diff --git a/readability/text/analyzer.py b/readability/text/analyzer.py index dce409e..905ed14 100644 --- a/readability/text/analyzer.py +++ b/readability/text/analyzer.py @@ -40,6 +40,14 @@ def num_dale_chall_complex(self): @property def num_spache_complex(self): return self.stats['num_spache_complex'] + + @property + def num_mono_syllable_words(self): + return self.stats['num_mono_syllable_words'] + + @property + def num_six_letter_words(self): + return self.stats['num_six_letter_words'] @property def avg_words_per_sentence(self): @@ -75,6 +83,8 @@ def _statistics(self, text): gunning_complex_count = 0 dale_chall_complex_count = 0 spache_complex_count = 0 + mono_syllable_count = 0 + six_letter_word_count = 0 porter_stemmer = 
PorterStemmer() def is_gunning_complex(t, syllable_count): @@ -97,7 +107,9 @@ def is_spache_complex(t): word_syllable_count = count_syllables(t) syllable_count += word_syllable_count letters_count += len(t) + six_letter_word_count += 1 if len(t) >= 6 else 0 poly_syllable_count += 1 if word_syllable_count >= 3 else 0 + mono_syllable_count += 1 if word_syllable_count == 1 else 0 gunning_complex_count += \ 1 if is_gunning_complex(t, word_syllable_count) \ else 0 @@ -119,6 +131,8 @@ def is_spache_complex(t): 'num_dale_chall_complex': dale_chall_complex_count, 'num_spache_complex': spache_complex_count, 'sentences': sentences, + 'num_mono_syllable_words': mono_syllable_count, + 'num_six_letter_words': six_letter_word_count, } def _tokenize_sentences(self, text): diff --git a/test/test_readability.py b/test/test_readability.py index 46e0d1b..54b3643 100644 --- a/test/test_readability.py +++ b/test/test_readability.py @@ -1,4 +1,5 @@ import unittest + from readability import Readability @@ -90,3 +91,25 @@ def test_print_stats(self): self.assertEqual(117, stats['num_words']) self.assertEqual(7, stats['num_sentences']) self.assertEqual(20, stats['num_polysyllabic_words']) + +class ReadabilityTestGerman(unittest.TestCase): + def setUp(self): + german_text = """ + In der Linguistik ist der Gunning-Fog-Index ein Lesbarkeitsindex für englische Texte. Der Index schätzt die Jahre formaler Bildung, die eine Person benötigt, um den Text beim ersten Lesen zu verstehen. Ein Fog-Index von 12 erfordert beispielsweise das Leseverständnis eines Schülers der letzten Klasse einer amerikanischen High School (etwa 18 Jahre alt). Der Test wurde 1952 von Robert Gunning, einem amerikanischen Geschäftsmann, der in Zeitungs- und Lehrbuchverlagen tätig war, entwickelt. + Der Fog-Index wird häufig verwendet, um zu bestätigen, dass ein Text für die beabsichtigte Zielgruppe leicht lesbar ist. Texte für ein breites Publikum sollten in der Regel einen Fog-Index von weniger als 12 haben. 
Texte, die ein nahezu universelles Verständnis erfordern, sollten einen Index von weniger als 8 haben. + """ + self.readability = Readability(german_text, language='de') + + def test_flesch_german(self): + r = self.readability.flesch() + print(r) + self.assertGreaterEqual(r.score, 60 ) + self.assertEqual(['13-15 jährige Schülerinnen und Schüler'], r.grade_levels) + self.assertEqual('mittel', r.ease) + + def test_wiener_sachtextformel_german(self): + r = self.readability.wiener_sachtextformel() + print(r) + self.assertGreaterEqual( r.score, 11.0) + self.assertLessEqual(r.score, 12.0) + self.assertEqual(r.grade_level, ['11th-12th grade']) \ No newline at end of file From d1f048188db77877b17dbc1cc55fbcc9d05b3dd9 Mon Sep 17 00:00:00 2001 From: Anirbanbhk88 Date: Tue, 12 Aug 2025 15:06:52 +0200 Subject: [PATCH 2/6] added more readability scores for German --- readability/readability.py | 39 +++++++++-- readability/scorers/__init__.py | 2 + readability/scorers/lix.py | 57 +++++++++++++++ .../scorers/miyazaki_readability_index.py | 70 +++++++++++++++++++ readability/scorers/wiener_sachtextformel.py | 70 +++++++++++++++++-- readability/text/analyzer.py | 17 ++++- test/test_readability.py | 41 ++++++++++- 7 files changed, 279 insertions(+), 17 deletions(-) create mode 100644 readability/scorers/lix.py create mode 100644 readability/scorers/miyazaki_readability_index.py diff --git a/readability/readability.py b/readability/readability.py index ea5d040..86464ef 100644 --- a/readability/readability.py +++ b/readability/readability.py @@ -1,8 +1,13 @@ -from .text import Analyzer -from .scorers import ARI, ColemanLiau, DaleChall, Flesch, \ - FleschKincaid, GunningFog, LinsearWrite, Smog, Spache, WienerSachtextformel import warnings + import nltk + +from .scorers import (ARI, ColemanLiau, DaleChall, Flesch, FleschKincaid, + GunningFog, LinsearWrite, LixLesbarkeitsIndex, + MiyazakiReadabilityIndex, Smog, Spache, + WienerSachtextformel) +from .text import Analyzer + 
nltk.download('punkt_tab') class Readability: @@ -49,9 +54,29 @@ def smog(self,all_sentences=False, ignore_length=False): `all_sentences` indicates whether SMOG should use a sample of 30 sentences, as described in the original paper, or if it should use all sentences in the text""" return Smog(self._statistics, self._analyzer.sentences, all_sentences=all_sentences, ignore_length=ignore_length).score() - def wiener_sachtextformel(self): - """Wiener Sachtextformel.""" - return WienerSachtextformel(self._statistics, self._min_words).score() + def erste_wiener_sachtextformel(self): + """erste Wiener Sachtextformel.""" + return WienerSachtextformel(self._statistics, self._min_words).erste_wiener_sachtextformel_score() + + def zweite_wiener_sachtextformel(self): + """zweite Wiener Sachtextformel.""" + return WienerSachtextformel(self._statistics, self._min_words).zweite_wiener_sachtextformel_score() + + def dritte_wiener_sachtextformel(self): + """dritte Wiener Sachtextformel.""" + return WienerSachtextformel(self._statistics, self._min_words).dritte_wiener_sachtextformel_score() + + def vierte_wiener_sachtextformel(self): + """vierte Wiener Sachtextformel.""" + return WienerSachtextformel(self._statistics, self._min_words).vierte_wiener_sachtextformel_score() + + def lix_lesbarkeits_index(self): + """LIX Lesbarkeitsindex.""" + return LixLesbarkeitsIndex(self._statistics, self._min_words).score() + + def miyazaki_readability_index(self): + """Miyazaki Readability Index.""" + return MiyazakiReadabilityIndex(self._statistics, self._min_words).score() def spache(self): """Spache Index.""" @@ -65,4 +90,6 @@ def statistics(self): 'num_polysyllabic_words': self._statistics.num_poly_syllable_words, 'avg_words_per_sentence': self._statistics.avg_words_per_sentence, 'avg_syllables_per_word': self._statistics.avg_syllables_per_word, + 'num_six_letter_words': self._statistics.num_six_letter_words, + 'num_mono_syllable_words': self._statistics.num_mono_syllable_words, } diff --git 
a/readability/scorers/__init__.py b/readability/scorers/__init__.py index f4a413c..6ec053c 100644 --- a/readability/scorers/__init__.py +++ b/readability/scorers/__init__.py @@ -9,3 +9,5 @@ from .smog import Smog from .spache import Spache from .wiener_sachtextformel import WienerSachtextformel +from .lix import LixLesbarkeitsIndex +from .miyazaki_readability_index import MiyazakiReadabilityIndex diff --git a/readability/scorers/lix.py b/readability/scorers/lix.py new file mode 100644 index 0000000..e4ea0ca --- /dev/null +++ b/readability/scorers/lix.py @@ -0,0 +1,57 @@ +from readability.exceptions import ReadabilityException + + +class Result: + def __init__(self, score): + self.score = score + + def __str__(self): + return "score: {}".format(self.score) + + + +class LixLesbarkeitsIndex: + def __init__(self, stats, min_words=100, language='de'): + self._stats = stats + if stats.num_words < min_words: + raise ReadabilityException('{} words required.'.format(min_words)) + + def score(self): + score = self._score() + return Result( + score=score + ) + + def _score(self): + """ + Calculates the Lix readability index + :param avg_words_per_sentence: mean sentence length + :param ratio_long_words: ratio of words with six or more characters + :return: Lix index + """ + stats = self._stats + return stats.avg_words_per_sentence + stats.avg_num_six_letter_words + + def _ease(self, score): + if score >= 4 and score <= 5: + return 'very_easy' + elif score >=6 and score <=7: + return 'easy' + elif score >=8 and score <=10: + return 'average' + elif score >=11 and score <=12: + return 'difficult' + else: + return 'very_difficult' + + def _grade_level(self, score): + if score >= 4 and score <= 5: + return ['4th-5th grade'] + elif score >=6 and score <=7: + return ['6th-7th grade'] + elif score >=8 and score <=10: + return ['8th-10th grade'] + elif score >=11 and score <=12: + return ['11th-12th grade'] + else: + return ['college level and above'] diff --git 
a/readability/scorers/miyazaki_readability_index.py b/readability/scorers/miyazaki_readability_index.py new file mode 100644 index 0000000..05b9769 --- /dev/null +++ b/readability/scorers/miyazaki_readability_index.py @@ -0,0 +1,70 @@ +from readability.exceptions import ReadabilityException + + +class Result: + def __init__(self, score): + self.score = score + + def __str__(self): + return "score: {}".format(self.score) + + + +class MiyazakiReadabilityIndex: + def __init__(self, stats, min_words=100, language='de'): + self._stats = stats + if stats.num_words < min_words: + raise ReadabilityException('{} words required.'.format(min_words)) + + def score(self): + score = self._score() + return Result( + score=score + ) + + def _score(self): + """ + Calculates the Miyazaki English as a Foreign Language Readability Index by Greenfield 1999 + It is parametrized for Japanese L2 speakers of English, who are students and read academic texts. + Average score of 50, ranges between 100 and minus infinity + + Formula: 164.935 - 18.792 * word_length - 1.916 * sentence_length + + :param word_length: average word length in characters + :param sentence_length: average sentence length in words + :return: ML2RI + """ + stats = self._stats + return 164.935 - 18.792 * stats.num_letters - 1.916 * stats.num_words + + def _ease(self, score): + if score >= 91 and score <= 100: + return 'very_easy' + elif score >= 81 and score <= 90: + return 'easy' + elif score >= 71 and score <= 80: + return 'Fairly easy' + elif score >= 61 and score <= 70: + return 'standard' + elif score >= 51 and score <= 60: + return 'fairly difficult' + elif score >= 31 and score <= 50: + return 'difficult' + elif score < 31: + return 'very_difficult' + + def _grade_level(self, score): + if score >= 91 and score <= 100: + return ['5th grade'] + elif score >= 81 and score <= 90: + return ['6th grade'] + elif score >= 71 and score <= 80: + return ['7th grade'] + elif score >= 61 and score <= 70: + return ['8th - 9th
grade'] + elif score >= 51 and score <= 60: + return ['10th - 12th grade'] + elif score >= 31 and score <= 50: + return ['post-school/college level'] + elif score < 31: + return ['university graduate'] \ No newline at end of file diff --git a/readability/scorers/wiener_sachtextformel.py b/readability/scorers/wiener_sachtextformel.py index b01e702..8eb79b7 100644 --- a/readability/scorers/wiener_sachtextformel.py +++ b/readability/scorers/wiener_sachtextformel.py @@ -17,17 +17,77 @@ def __init__(self, stats, min_words=100, language='de'): if stats.num_words < min_words: raise ReadabilityException('{} words required.'.format(min_words)) - def score(self): - score = self._score() + def erste_wiener_sachtextformel_score(self): + score = self._erste_wiener_sachtextformel_score() return Result( score=score, grade_level=self._grade_level(score) ) + + def zweite_wiener_sachtextformel_score(self): + score = self._zweite_wiener_sachtextformel_score() + return Result( + score=score, + grade_level=self._grade_level(score) + ) + + def dritte_wiener_sachtextformel_score(self): + score = self._dritte_wiener_sachtextformel_score() + return Result( + score=score, + grade_level=self._grade_level(score) + ) + + def vierte_wiener_sachtextformel_score(self): + score = self._vierte_wiener_sachtextformel_score() + return Result( + score=score, + grade_level=self._grade_level(score) + ) + + def _erste_wiener_sachtextformel_score(self): + """ + The first Wiener Sachtextformel + + The formula is: + 0.1935 * ratio of words with >= 3 syllables + 0.1672 * mean sentence length + + 0.1297 * ratio of words with >= 6 letters - 0.0327 * ratio of words with 1 syllable - 0.875 + """ + stats = self._stats + return (0.1935 * (stats.num_poly_syllable_words / stats.num_words)) + (0.1672 * stats.avg_words_per_sentence) + \ + (0.1297 * (stats.num_six_letter_words / stats.num_words)) - (0.0327 * (stats.num_mono_syllable_words / stats.num_words)*100) - 0.875 + + def _zweite_wiener_sachtextformel_score(self): 
+ """ + The second Wiener Sachtextformel + + The formula is: + 0.2007 * ratio of words with >= 3 syllables + 0.1682 * mean sentence length + + 0.1373 * ratio of words with >= 6 letters - 2.779 + """ + stats = self._stats + return (0.2007 * (stats.num_poly_syllable_words / stats.num_words)) + (0.1682 * stats.avg_words_per_sentence) + \ + (0.1373 * (stats.num_six_letter_words / stats.num_words)) - 2.779 + + def _dritte_wiener_sachtextformel_score(self): + """ + The third Wiener Sachtextformel + + The formula is: + 0.2963 * ratio of words with >= 3 syllables + 0.1905 * mean sentence length - 1.1144 + """ + stats = self._stats + return (0.2963 * (stats.num_poly_syllable_words / stats.num_words)) + (0.1905 * stats.avg_words_per_sentence) - 1.1144 + + def _vierte_wiener_sachtextformel_score(self): + """ + The fourth Wiener Sachtextformel - def _score(self): + The formula is: + 0.2744 * ratio of words with >= 3 syllables + 0.2656 * mean sentence length - 1.693 + """ stats = self._stats - return (0.1935 * (stats.num_poly_syllable_words / stats.num_words)*100) + (0.1672 * stats.num_words) + \ - (0.1297 * (stats.num_six_letter_words / stats.num_words)*100) - (0.0327 * (stats.num_mono_syllable_words / stats.num_words)*100) - 15.59 + return (0.2744 * (stats.num_poly_syllable_words / stats.num_words)) + (0.2656 * stats.avg_words_per_sentence) - 1.693 def _ease(self, score): if score >= 4 and score <= 5: diff --git a/readability/text/analyzer.py b/readability/text/analyzer.py index 905ed14..a0c205f 100644 --- a/readability/text/analyzer.py +++ b/readability/text/analyzer.py @@ -1,8 +1,10 @@ import os import re -from .syllables import count as count_syllables -from nltk.tokenize import sent_tokenize, TweetTokenizer + from nltk.stem.porter import PorterStemmer +from nltk.tokenize import TweetTokenizer, sent_tokenize + +from .syllables import count as count_syllables class AnalyzerStatistics: @@ -48,7 +50,11 @@ def num_mono_syllable_words(self): @property def 
num_six_letter_words(self): return self.stats['num_six_letter_words'] - + + @property + def avg_num_six_letter_words(self): + return self.stats['num_six_letter_words'] / self.stats['num_words'] if self.stats['num_words'] > 0 else 0 + @property def avg_words_per_sentence(self): return self.num_words / self.num_sentences @@ -133,6 +139,7 @@ def is_spache_complex(t): 'sentences': sentences, 'num_mono_syllable_words': mono_syllable_count, 'num_six_letter_words': six_letter_word_count, + 'avg_words_per_sentence': word_count / sentence_count if sentence_count > 0 else 0, } def _tokenize_sentences(self, text): @@ -167,3 +174,7 @@ def _load_spache(self): spache_path = os.path.join(cur_path, '..', 'data', file) with open(spache_path) as f: return set(line.strip() for line in f) + with open(spache_path) as f: + return set(line.strip() for line in f) + with open(spache_path) as f: + return set(line.strip() for line in f) diff --git a/test/test_readability.py b/test/test_readability.py index 54b3643..3b89c49 100644 --- a/test/test_readability.py +++ b/test/test_readability.py @@ -107,9 +107,44 @@ def test_flesch_german(self): self.assertEqual(['13-15 jährige Schülerinnen und Schüler'], r.grade_levels) self.assertEqual('mittel', r.ease) - def test_wiener_sachtextformel_german(self): - r = self.readability.wiener_sachtextformel() + def test_erste_wiener_sachtextformel_german(self): + r = self.readability.erste_wiener_sachtextformel() print(r) self.assertGreaterEqual( r.score, 11.0) self.assertLessEqual(r.score, 12.0) - self.assertEqual(r.grade_level, ['11th-12th grade']) \ No newline at end of file + self.assertEqual(r.grade_level, ['11th-12th grade']) + + def test_zweite_wiener_sachtextformel_german(self): + r = self.readability.zweite_wiener_sachtextformel() + print(r) + self.assertGreaterEqual( r.score, 11.0) + self.assertLessEqual(r.score, 12.0) + self.assertEqual(r.grade_level, ['11th-12th grade']) + + def test_dritte_wiener_sachtextformel_german(self): + r = 
self.readability.dritte_wiener_sachtextformel() + print(r) + self.assertGreaterEqual( r.score, 11.0) + self.assertLessEqual(r.score, 12.0) + self.assertEqual(r.grade_level, ['11th-12th grade']) + + def test_vierte_wiener_sachtextformel_german(self): + r = self.readability.vierte_wiener_sachtextformel() + print(r) + self.assertGreaterEqual( r.score, 11.0) + self.assertLessEqual(r.score, 12.0) + self.assertEqual(r.grade_level, ['11th-12th grade']) + + def test_lix_lesbarkeits_index_german(self): + r = self.readability.lix_lesbarkeits_index() + print(r) + self.assertGreaterEqual( r.score, 11.0) + self.assertLessEqual(r.score, 12.0) + self.assertEqual(r.grade_level, ['11th-12th grade']) + + def test_miyazaki_readability_index_german(self): + r = self.readability.miyazaki_readability_index() + print(r) + self.assertGreaterEqual( r.score, 31.0) + self.assertLessEqual(r.score, 50.0) + self.assertEqual(r.grade_level, ['post-school/college level']) From caf721df97e2c1b2eeb6c29ee110aa14dea54d9c Mon Sep 17 00:00:00 2001 From: Anirbanbhk88 Date: Thu, 14 Aug 2025 12:56:44 +0200 Subject: [PATCH 3/6] modified method names in german readability metrics --- readability/scorers/lix.py | 13 +++++++--- .../scorers/miyazaki_readability_index.py | 13 +++++++--- readability/scorers/wiener_sachtextformel.py | 25 +++++++++++-------- 3 files changed, 33 insertions(+), 18 deletions(-) diff --git a/readability/scorers/lix.py b/readability/scorers/lix.py index e4ea0ca..edfc2ba 100644 --- a/readability/scorers/lix.py +++ b/readability/scorers/lix.py @@ -2,11 +2,14 @@ class Result: - def __init__(self, score): + def __init__(self, score, grade_levels, ease): self.score = score + self.ease = ease + self.grade_levels = grade_levels def __str__(self): - return "score: {}".format(self.score) + return "score: {}, ease: '{}', grade_levels: {}". 
\ + format(self.score, self.ease, self.grade_levels) @@ -19,7 +22,9 @@ def __init__(self, stats, min_words=100, language='de'): def score(self): score = self._score() return Result( - score=score + score=score, + ease=self._ease(score), + grade_levels=self._grade_levels(score) ) def _score(self): @@ -44,7 +49,7 @@ def _ease(self, score): else: return 'very_difficult' - def _grade_level(self, score): + def _grade_levels(self, score): if score >= 4 and score <= 5: return ['4th-5th grade'] elif score >=6 and score <=7: diff --git a/readability/scorers/miyazaki_readability_index.py b/readability/scorers/miyazaki_readability_index.py index 05b9769..cdc7f4e 100644 --- a/readability/scorers/miyazaki_readability_index.py +++ b/readability/scorers/miyazaki_readability_index.py @@ -2,11 +2,14 @@ class Result: - def __init__(self, score): + def __init__(self, score, grade_levels, ease): self.score = score + self.ease = ease + self.grade_levels = grade_levels def __str__(self): - return "score: {}".format(self.score) + return "score: {}, ease: '{}', grade_levels: {}". 
\ + format(self.score, self.ease, self.grade_levels) @@ -19,7 +22,9 @@ def __init__(self, stats, min_words=100, language='de'): def score(self): score = self._score() return Result( - score=score + score=score, + ease=self._ease(score), + grade_levels=self._grade_levels(score) ) def _score(self): @@ -53,7 +58,7 @@ def _ease(self, score): elif score < 31: return 'very_difficult' - def _grade_level(self, score): + def _grade_levels(self, score): if score >= 91 and score <= 100: return ['5th grade'] elif score >= 81 and score <= 90: diff --git a/readability/scorers/wiener_sachtextformel.py b/readability/scorers/wiener_sachtextformel.py index 8eb79b7..6553eb6 100644 --- a/readability/scorers/wiener_sachtextformel.py +++ b/readability/scorers/wiener_sachtextformel.py @@ -1,14 +1,15 @@ from readability.exceptions import ReadabilityException - + class Result: - def __init__(self, score, grade_level): + def __init__(self, score, grade_levels, ease): self.score = score - self.grade_level = grade_level + self.ease = ease + self.grade_levels = grade_levels def __str__(self): - return "score: {}, grade_level: '{}'". \ - format(self.score, self.grade_level) + return "score: {}, ease: '{}', grade_levels: {}". 
\ + format(self.score, self.ease, self.grade_levels) class WienerSachtextformel: @@ -21,28 +22,32 @@ def erste_wiener_sachtextformel_score(self): score = self._erste_wiener_sachtextformel_score() return Result( score=score, - grade_level=self._grade_level(score) + ease=self._ease(score), + grade_levels=self._grade_levels(score) ) def zweite_wiener_sachtextformel_score(self): score = self._zweite_wiener_sachtextformel_score() return Result( score=score, - grade_level=self._grade_level(score) + ease=self._ease(score), + grade_levels=self._grade_levels(score) ) def dritte_wiener_sachtextformel_score(self): score = self._dritte_wiener_sachtextformel_score() return Result( score=score, - grade_level=self._grade_level(score) + ease=self._ease(score), + grade_levels=self._grade_levels(score) ) def vierte_wiener_sachtextformel_score(self): score = self._vierte_wiener_sachtextformel_score() return Result( score=score, - grade_level=self._grade_level(score) + ease=self._ease(score), + grade_levels=self._grade_levels(score) ) def _erste_wiener_sachtextformel_score(self): @@ -101,7 +106,7 @@ def _ease(self, score): else: return 'very_difficult' - def _grade_level(self, score): + def _grade_levels(self, score): if score >= 4 and score <= 5: return ['4th-5th grade'] elif score >=6 and score <=7: From 5014554520d0f9b261add770ce7651f5712f0139 Mon Sep 17 00:00:00 2001 From: Anirban Bhowmick Date: Tue, 19 Aug 2025 11:13:27 +0200 Subject: [PATCH 4/6] added score description for wiener sachstext formel --- readability/scorers/lix.py | 2 +- readability/scorers/miyazaki_readability_index.py | 2 +- readability/scorers/wiener_sachtextformel.py | 8 ++++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/readability/scorers/lix.py b/readability/scorers/lix.py index edfc2ba..e1a274a 100644 --- a/readability/scorers/lix.py +++ b/readability/scorers/lix.py @@ -14,7 +14,7 @@ def __str__(self): class LixLesbarkeitsIndex: - def __init__(self, stats, min_words=100, language='de'): + 
def __init__(self, stats, min_words=100): self._stats = stats if stats.num_words < min_words: raise ReadabilityException('{} words required.'.format(min_words)) diff --git a/readability/scorers/miyazaki_readability_index.py b/readability/scorers/miyazaki_readability_index.py index cdc7f4e..0c64c19 100644 --- a/readability/scorers/miyazaki_readability_index.py +++ b/readability/scorers/miyazaki_readability_index.py @@ -14,7 +14,7 @@ def __str__(self): class MiyazakiReadabilityIndex: - def __init__(self, stats, min_words=100, language='de'): + def __init__(self, stats, min_words=100): self._stats = stats if stats.num_words < min_words: raise ReadabilityException('{} words required.'.format(min_words)) diff --git a/readability/scorers/wiener_sachtextformel.py b/readability/scorers/wiener_sachtextformel.py index 6553eb6..9a88bea 100644 --- a/readability/scorers/wiener_sachtextformel.py +++ b/readability/scorers/wiener_sachtextformel.py @@ -1,6 +1,6 @@ from readability.exceptions import ReadabilityException - + class Result: def __init__(self, score, grade_levels, ease): self.score = score @@ -13,7 +13,7 @@ def __str__(self): class WienerSachtextformel: - def __init__(self, stats, min_words=100, language='de'): + def __init__(self, stats, min_words=100): self._stats = stats if stats.num_words < min_words: raise ReadabilityException('{} words required.'.format(min_words)) @@ -53,6 +53,7 @@ def vierte_wiener_sachtextformel_score(self): def _erste_wiener_sachtextformel_score(self): """ The first Wiener Sachtextformel + WSTF1 considers all four main factors: sentence length, sentence count, proportion of long words, and proportion of monosyllabic words. 
The formula is: 0.1935 * ratio of words with >= 3 syllables + 0.1672 * mean sentence length + @@ -65,6 +66,7 @@ def _erste_wiener_sachtextformel_score(self): def _zweite_wiener_sachtextformel_score(self): """ The second Wiener Sachtextformel + WSTF2 is similar to WSTF1, but weights the factors slightly differently, omitting the proportion of monosyllabic words. The formula is: 0.2007 * ratio of words with >= 3 syllables + 0.1682 * mean sentence length + @@ -77,6 +79,7 @@ def _zweite_wiener_sachtextformel_score(self): def _dritte_wiener_sachtextformel_score(self): """ The third Wiener Sachtextformel + WSTF3 is the simplest formula because it only takes into account the mean sentence length and the proportion of long words. The formula is: 0.2963 * ratio of words with >= 3 syllables + 0.1905 * mean sentence length - 1.1144 @@ -87,6 +90,7 @@ def _dritte_wiener_sachtextformel_score(self): def _vierte_wiener_sachtextformel_score(self): """ The fourth Wiener Sachtextformel + WSTF4 focuses specifically on readability in relation to school levels, which is why the weighting of sentence length is greater. 
The formula is: 0.2744 * ratio of words with >= 3 syllables + 0.2656 * mean sentence length - 1.693 From 3e3a32d57c2b57501bf38af5dd69c778d8fac0d3 Mon Sep 17 00:00:00 2001 From: Anirban Bhowmick Date: Tue, 26 Aug 2025 10:40:10 +0200 Subject: [PATCH 5/6] added GSMOG readability score for german text --- readability/readability.py | 7 ++++- readability/scorers/__init__.py | 5 +-- readability/scorers/gsmog.py | 54 +++++++++++++++++++++++++++++++++ test/test_readability.py | 4 +++ 4 files changed, 67 insertions(+), 3 deletions(-) create mode 100644 readability/scorers/gsmog.py diff --git a/readability/readability.py b/readability/readability.py index 86464ef..c8c4d06 100644 --- a/readability/readability.py +++ b/readability/readability.py @@ -5,7 +5,7 @@ from .scorers import (ARI, ColemanLiau, DaleChall, Flesch, FleschKincaid, GunningFog, LinsearWrite, LixLesbarkeitsIndex, MiyazakiReadabilityIndex, Smog, Spache, - WienerSachtextformel) + WienerSachtextformel, Gsmog) from .text import Analyzer nltk.download('punkt_tab') @@ -54,6 +54,11 @@ def smog(self,all_sentences=False, ignore_length=False): `all_sentences` indicates whether SMOG should use a sample of 30 sentences, as described in the original paper, or if it should use all sentences in the text""" return Smog(self._statistics, self._analyzer.sentences, all_sentences=all_sentences, ignore_length=ignore_length).score() + + def gsmog(self, ignore_length=False): + """GSMOG Index. 
Measure the SMOG score adapted for German text""" + return Gsmog(self._statistics, ignore_length=ignore_length).score() + def erste_wiener_sachtextformel(self): """erste Wiener Sachtextformel.""" return WienerSachtextformel(self._statistics, self._min_words).erste_wiener_sachtextformel_score() diff --git a/readability/scorers/__init__.py b/readability/scorers/__init__.py index 6ec053c..c42e794 100644 --- a/readability/scorers/__init__.py +++ b/readability/scorers/__init__.py @@ -4,10 +4,11 @@ from .dale_chall import DaleChall from .flesch import Flesch from .flesch_kincaid import FleschKincaid +from .gsmog import Gsmog from .gunning_fog import GunningFog from .linsear_write import LinsearWrite +from .lix import LixLesbarkeitsIndex +from .miyazaki_readability_index import MiyazakiReadabilityIndex from .smog import Smog from .spache import Spache from .wiener_sachtextformel import WienerSachtextformel -from .lix import LixLesbarkeitsIndex -from .miyazaki_readability_index import MiyazakiReadabilityIndex diff --git a/readability/scorers/gsmog.py b/readability/scorers/gsmog.py new file mode 100644 index 0000000..1733aac --- /dev/null +++ b/readability/scorers/gsmog.py @@ -0,0 +1,54 @@ +import math +import warnings + +from readability.exceptions import ReadabilityException + + +class Result: + def __init__(self, score, grade_level): + self.score = score + self.grade_level = grade_level + + def __str__(self): + return "score: {}, grade_level: {}". \ + format(self.score, self.grade_level) + + +class Gsmog: + def __init__(self, stats, ignore_length=False): + """ + Bamberger adapted McLaughlin's original formula (Harry McLaughlin, 1969 https://ogg.osu.edu/media/documents/health_lit/WRRSMOG_Readability_Formula_G._Harry_McLaughlin__1969_.pdf) + for German-speaking countries. The formula compares the number of multisyllabic words (three or more, see above) to the number of sentences in the entire text. 
Since the original formula refers to a + sample of 30 sentences, the implementation in this class uses 30 sentences as a default if all_sentences is False. + """ + if stats.num_sentences < 30: + if not ignore_length: + raise ReadabilityException( + 'SMOG requires 30 sentences. {} found' + .format(stats.num_sentences)) + else: + warnings.warn( + 'SMOG requires 30 sentences. {} found' + .format(stats.num_sentences)) + + + self._stats = stats + + + def score(self): + score = self._score() + grade_level = self._grade_level(score) + return Result( + score=score, + grade_level=grade_level + ) + + def _score(self): + + num_sentences = self._stats.num_sentences + num_complex_words = self._stats.num_poly_syllable_words # words with 3 or more syllables + return math.sqrt(30 * num_complex_words / num_sentences) - 2 + + def _grade_level(self, score): + return str(round(score)) + diff --git a/test/test_readability.py b/test/test_readability.py index 3b89c49..0430f36 100644 --- a/test/test_readability.py +++ b/test/test_readability.py @@ -148,3 +148,7 @@ def test_miyazaki_readability_index_german(self): self.assertGreaterEqual( r.score, 31.0) self.assertLessEqual(r.score, 50.0) self.assertEqual(r.grade_level, ['post-school/college level']) + + def test_gsmog_german(self): + r = self.readability.gsmog() + print(r) From fd55d1e1d5a56128968b1612ee65e1c2f48dcd67 Mon Sep 17 00:00:00 2001 From: Anirban Bhowmick Date: Thu, 28 Aug 2025 11:15:24 +0200 Subject: [PATCH 6/6] fixed grade levels for german readability metrics --- readability/scorers/flesch.py | 12 ++++++------ readability/scorers/lix.py | 8 ++++---- readability/scorers/miyazaki_readability_index.py | 10 +++++----- readability/scorers/wiener_sachtextformel.py | 8 ++++---- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/readability/scorers/flesch.py b/readability/scorers/flesch.py index c8dbd99..25cbd3f 100644 --- a/readability/scorers/flesch.py +++ b/readability/scorers/flesch.py @@ -90,16 +90,16 @@ def 
_grade_levels(self, score): return ['college_graduate'] elif self._language == 'de': if score >= 90 and score <= 100: - return ['11-jährige Schülerinnen und Schüler'] + return ['11'] elif score >= 80 and score < 90: - return ['11-12 jährige Schülerinnen und Schüler'] + return ['11, 12'] elif score >= 70 and score < 80: - return ['11-12 jährige Schülerinnen und Schüler'] + return ['11, 12'] elif score >= 60 and score < 70: - return ['13-15 jährige Schülerinnen und Schüler'] + return ['13, 14, 15'] elif score >= 50 and score < 60: - return ['13-15 jährige Schülerinnen und Schüler'] + return ['13, 14, 15'] elif score >= 30 and score < 50: - return ['13-15 jährige Schülerinnen und Schüler'] + return ['13, 14, 15'] else: return ['Akademikerinnen und Akademiker'] diff --git a/readability/scorers/lix.py b/readability/scorers/lix.py index e1a274a..a547f48 100644 --- a/readability/scorers/lix.py +++ b/readability/scorers/lix.py @@ -51,12 +51,12 @@ def _ease(self, score): def _grade_levels(self, score): if score >= 4 and score <= 5: - return ['4th-5th grade'] + return [4, 5] elif score >=6 and score <=7: - return ['6th-7th grade'] + return [6, 7] elif score >=8 and score <=10: - return ['8th-10th grade'] + return [8, 9, 10] elif score >=11 and score <=12: - return ['11th-12th grade'] + return [11, 12] else: return ['college level and above'] diff --git a/readability/scorers/miyazaki_readability_index.py b/readability/scorers/miyazaki_readability_index.py index 0c64c19..8cf6323 100644 --- a/readability/scorers/miyazaki_readability_index.py +++ b/readability/scorers/miyazaki_readability_index.py @@ -60,15 +60,15 @@ def _ease(self, score): def _grade_levels(self, score): if score >= 91 and score <= 100: - return ['5th grade'] + return ['5'] elif score >= 81 and score <= 90: - return ['6th grade'] + return ['6'] elif score >= 71 and score <= 80: - return ['7th grade'] + return ['7'] elif score >= 61 and score <= 70: - return ['8th - 9th grade'] + return ['8', '9'] elif score >= 
51 and score <= 60: - return ['10th - 12th grade'] + return ['10', '11', '12'] elif score >= 31 and score <= 50: return ['post-school/college level'] elif score < 31: diff --git a/readability/scorers/wiener_sachtextformel.py b/readability/scorers/wiener_sachtextformel.py index 9a88bea..6279f27 100644 --- a/readability/scorers/wiener_sachtextformel.py +++ b/readability/scorers/wiener_sachtextformel.py @@ -112,12 +112,12 @@ def _ease(self, score): def _grade_levels(self, score): if score >= 4 and score <= 5: - return ['4th-5th grade'] + return ['4', '5'] elif score >=6 and score <=7: - return ['6th-7th grade'] + return ['6', '7'] elif score >=8 and score <=10: - return ['8th-10th grade'] + return ['8', '9', '10'] elif score >=11 and score <=12: - return ['11th-12th grade'] + return ['11', '12'] else: return ['college level and above']