diff options
Diffstat (limited to 'lib/chardet/metadata/languages.py')
-rw-r--r-- | lib/chardet/metadata/languages.py | 351 |
1 files changed, 351 insertions, 0 deletions
diff --git a/lib/chardet/metadata/languages.py b/lib/chardet/metadata/languages.py new file mode 100644 index 0000000..1d37884 --- /dev/null +++ b/lib/chardet/metadata/languages.py @@ -0,0 +1,351 @@ +""" +Metadata about languages used by our model training code for our +SingleByteCharSetProbers. Could be used for other things in the future. + +This code is based on the language metadata from the uchardet project. +""" + +from string import ascii_letters + +# TODO: Add Ukrainian (KOI8-U) + + +class Language: + """Metadata about a language useful for training models + + :ivar name: The human name for the language, in English. + :type name: str + :ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, + or use another catalog as a last resort. + :type iso_code: str + :ivar use_ascii: Whether or not ASCII letters should be included in trained + models. + :type use_ascii: bool + :ivar charsets: The charsets we want to support and create data for. + :type charsets: list of str + :ivar alphabet: The characters in the language's alphabet. If `use_ascii` is + `True`, you only need to add those not in the ASCII set. + :type alphabet: str + :ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling + Wikipedia for training data. + :type wiki_start_pages: list of str + """ + + def __init__( + self, + name=None, + iso_code=None, + use_ascii=True, + charsets=None, + alphabet=None, + wiki_start_pages=None, + ): + super().__init__() + self.name = name + self.iso_code = iso_code + self.use_ascii = use_ascii + self.charsets = charsets + if self.use_ascii: + if alphabet: + alphabet += ascii_letters + else: + alphabet = ascii_letters + elif not alphabet: + raise ValueError("Must supply alphabet if use_ascii is False") + self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None + self.wiki_start_pages = wiki_start_pages + + def __repr__(self): + param_str = ", ".join( + f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_") + ) + return f"{self.__class__.__name__}({param_str})" + + +LANGUAGES = { + "Arabic": Language( + name="Arabic", + iso_code="ar", + use_ascii=False, + # We only support encodings that use isolated + # forms, because the current recommendation is + # that the rendering system handles presentation + # forms. This means we purposefully skip IBM864. + charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"], + alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ", + wiki_start_pages=["الصفحة_الرئيسية"], + ), + "Belarusian": Language( + name="Belarusian", + iso_code="be", + use_ascii=False, + charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"], + alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ", + wiki_start_pages=["Галоўная_старонка"], + ), + "Bulgarian": Language( + name="Bulgarian", + iso_code="bg", + use_ascii=False, + charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"], + alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя", + wiki_start_pages=["Начална_страница"], + ), + "Czech": Language( + name="Czech", + iso_code="cz", + use_ascii=True, + charsets=["ISO-8859-2", "WINDOWS-1250"], + alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ", + wiki_start_pages=["Hlavní_strana"], + ), + "Danish": Language( + name="Danish", + iso_code="da", + use_ascii=True, + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + alphabet="æøåÆØÅ", + wiki_start_pages=["Forside"], + ), + "German": Language( + name="German", + iso_code="de", + use_ascii=True, + charsets=["ISO-8859-1", "WINDOWS-1252"], + alphabet="äöüßÄÖÜ", + wiki_start_pages=["Wikipedia:Hauptseite"], + ), + "Greek": Language( + name="Greek", + iso_code="el", + use_ascii=False, + charsets=["ISO-8859-7", "WINDOWS-1253"], + alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ", + wiki_start_pages=["Πύλη:Κύρια"], + ), + "English": Language( + name="English", + iso_code="en", + use_ascii=True, + charsets=["ISO-8859-1", "WINDOWS-1252"], + wiki_start_pages=["Main_Page"], + ), + "Esperanto": Language( + name="Esperanto", + iso_code="eo", + # Q, W, X, and Y not used at all + use_ascii=False, + charsets=["ISO-8859-3"], + alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ", + wiki_start_pages=["Vikipedio:Ĉefpaĝo"], + ), + "Spanish": Language( + name="Spanish", + iso_code="es", + use_ascii=True, + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + alphabet="ñáéíóúüÑÁÉÍÓÚÜ", + wiki_start_pages=["Wikipedia:Portada"], + ), + "Estonian": Language( + name="Estonian", + iso_code="et", + use_ascii=False, + charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"], + # C, F, Š, Q, W, X, Y, Z, Ž are only for + # loanwords + alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü", + wiki_start_pages=["Esileht"], + ), + "Finnish": Language( + name="Finnish", + iso_code="fi", + use_ascii=True, + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + alphabet="ÅÄÖŠŽåäöšž", + wiki_start_pages=["Wikipedia:Etusivu"], + ), + "French": Language( + name="French", + iso_code="fr", + use_ascii=True, + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ", + wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"], + ), + "Hebrew": Language( + name="Hebrew", + iso_code="he", + use_ascii=False, + charsets=["ISO-8859-8", "WINDOWS-1255"], + alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ", + wiki_start_pages=["עמוד_ראשי"], + ), + "Croatian": Language( + name="Croatian", + iso_code="hr", + # Q, W, X, Y are only used for foreign words. + use_ascii=False, + charsets=["ISO-8859-2", "WINDOWS-1250"], + alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ", + wiki_start_pages=["Glavna_stranica"], + ), + "Hungarian": Language( + name="Hungarian", + iso_code="hu", + # Q, W, X, Y are only used for foreign words. + use_ascii=False, + charsets=["ISO-8859-2", "WINDOWS-1250"], + alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ", + wiki_start_pages=["Kezdőlap"], + ), + "Italian": Language( + name="Italian", + iso_code="it", + use_ascii=True, + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + alphabet="ÀÈÉÌÒÓÙàèéìòóù", + wiki_start_pages=["Pagina_principale"], + ), + "Lithuanian": Language( + name="Lithuanian", + iso_code="lt", + use_ascii=False, + charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"], + # Q, W, and X not used at all + alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž", + wiki_start_pages=["Pagrindinis_puslapis"], + ), + "Latvian": Language( + name="Latvian", + iso_code="lv", + use_ascii=False, + charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"], + # Q, W, X, Y are only for loanwords + alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž", + wiki_start_pages=["Sākumlapa"], + ), + "Macedonian": Language( + name="Macedonian", + iso_code="mk", + use_ascii=False, + charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"], + alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш", + wiki_start_pages=["Главна_страница"], + ), + "Dutch": Language( + name="Dutch", + iso_code="nl", + use_ascii=True, + charsets=["ISO-8859-1", "WINDOWS-1252"], + wiki_start_pages=["Hoofdpagina"], + ), + "Polish": Language( + name="Polish", + iso_code="pl", + # Q and X are only used for foreign words. + use_ascii=False, + charsets=["ISO-8859-2", "WINDOWS-1250"], + alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż", + wiki_start_pages=["Wikipedia:Strona_główna"], + ), + "Portuguese": Language( + name="Portuguese", + iso_code="pt", + use_ascii=True, + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú", + wiki_start_pages=["Wikipédia:Página_principal"], + ), + "Romanian": Language( + name="Romanian", + iso_code="ro", + use_ascii=True, + charsets=["ISO-8859-2", "WINDOWS-1250"], + alphabet="ăâîșțĂÂÎȘȚ", + wiki_start_pages=["Pagina_principală"], + ), + "Russian": Language( + name="Russian", + iso_code="ru", + use_ascii=False, + charsets=[ + "ISO-8859-5", + "WINDOWS-1251", + "KOI8-R", + "MacCyrillic", + "IBM866", + "IBM855", + ], + alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ", + wiki_start_pages=["Заглавная_страница"], + ), + "Slovak": Language( + name="Slovak", + iso_code="sk", + use_ascii=True, + charsets=["ISO-8859-2", "WINDOWS-1250"], + alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ", + wiki_start_pages=["Hlavná_stránka"], + ), + "Slovene": Language( + name="Slovene", + iso_code="sl", + # Q, W, X, Y are only used for foreign words. + use_ascii=False, + charsets=["ISO-8859-2", "WINDOWS-1250"], + alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ", + wiki_start_pages=["Glavna_stran"], + ), + # Serbian can be written in both Latin and Cyrillic, but there's no + # simple way to get the Latin alphabet pages from Wikipedia through + # the API, so for now we just support Cyrillic. + "Serbian": Language( + name="Serbian", + iso_code="sr", + alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш", + charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"], + wiki_start_pages=["Главна_страна"], + ), + "Thai": Language( + name="Thai", + iso_code="th", + use_ascii=False, + charsets=["ISO-8859-11", "TIS-620", "CP874"], + alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛", + wiki_start_pages=["หน้าหลัก"], + ), + "Turkish": Language( + name="Turkish", + iso_code="tr", + # Q, W, and X are not used by Turkish + use_ascii=False, + charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"], + alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ", + wiki_start_pages=["Ana_Sayfa"], + ), + "Vietnamese": Language( + name="Vietnamese", + iso_code="vi", + use_ascii=False, + # Windows-1258 is the only common 8-bit + # Vietnamese encoding supported by Python. + # From Wikipedia: + # For systems that lack support for Unicode, + # dozens of 8-bit Vietnamese code pages are + # available.[1] The most common are VISCII + # (TCVN 5712:1993), VPS, and Windows-1258.[3] + # Where ASCII is required, such as when + # ensuring readability in plain text e-mail, + # Vietnamese letters are often encoded + # according to Vietnamese Quoted-Readable + # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4] + # though usage of either variable-width + # scheme has declined dramatically following + # the adoption of Unicode on the World Wide + # Web. + charsets=["WINDOWS-1258"], + alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY", + wiki_start_pages=["Chữ_Quốc_ngữ"], + ), +} |