summaryrefslogtreecommitdiffstats
path: root/lib/chardet/metadata/languages.py
diff options
context:
space:
mode:
Diffstat (limited to 'lib/chardet/metadata/languages.py')
-rw-r--r--lib/chardet/metadata/languages.py351
1 files changed, 351 insertions, 0 deletions
diff --git a/lib/chardet/metadata/languages.py b/lib/chardet/metadata/languages.py
new file mode 100644
index 0000000..1d37884
--- /dev/null
+++ b/lib/chardet/metadata/languages.py
@@ -0,0 +1,351 @@
+"""
+Metadata about languages used by our model training code for our
+SingleByteCharSetProbers. Could be used for other things in the future.
+
+This code is based on the language metadata from the uchardet project.
+"""
+
+from string import ascii_letters
+
+# TODO: Add Ukrainian (KOI8-U)
+
+
+class Language:
+ """Metadata about a language useful for training models
+
+ :ivar name: The human name for the language, in English.
+ :type name: str
+ :ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
+ or use another catalog as a last resort.
+ :type iso_code: str
+ :ivar use_ascii: Whether or not ASCII letters should be included in trained
+ models.
+ :type use_ascii: bool
+ :ivar charsets: The charsets we want to support and create data for.
+ :type charsets: list of str
+ :ivar alphabet: The characters in the language's alphabet. If `use_ascii` is
+ `True`, you only need to add those not in the ASCII set.
+ :type alphabet: str
+ :ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
+ Wikipedia for training data.
+ :type wiki_start_pages: list of str
+ """
+
+ def __init__(
+ self,
+ name=None,
+ iso_code=None,
+ use_ascii=True,
+ charsets=None,
+ alphabet=None,
+ wiki_start_pages=None,
+ ):
+ super().__init__()
+ self.name = name
+ self.iso_code = iso_code
+ self.use_ascii = use_ascii
+ self.charsets = charsets
+ if self.use_ascii:
+ if alphabet:
+ alphabet += ascii_letters
+ else:
+ alphabet = ascii_letters
+ elif not alphabet:
+ raise ValueError("Must supply alphabet if use_ascii is False")
+ self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None
+ self.wiki_start_pages = wiki_start_pages
+
+ def __repr__(self):
+ param_str = ", ".join(
+ f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
+ )
+ return f"{self.__class__.__name__}({param_str})"
+
+
+LANGUAGES = {
+ "Arabic": Language(
+ name="Arabic",
+ iso_code="ar",
+ use_ascii=False,
+ # We only support encodings that use isolated
+ # forms, because the current recommendation is
+ # that the rendering system handles presentation
+ # forms. This means we purposefully skip IBM864.
+ charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
+ alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
+ wiki_start_pages=["الصفحة_الرئيسية"],
+ ),
+ "Belarusian": Language(
+ name="Belarusian",
+ iso_code="be",
+ use_ascii=False,
+ charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
+ alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
+ wiki_start_pages=["Галоўная_старонка"],
+ ),
+ "Bulgarian": Language(
+ name="Bulgarian",
+ iso_code="bg",
+ use_ascii=False,
+ charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
+ alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
+ wiki_start_pages=["Начална_страница"],
+ ),
+ "Czech": Language(
+ name="Czech",
+ iso_code="cz",
+ use_ascii=True,
+ charsets=["ISO-8859-2", "WINDOWS-1250"],
+ alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
+ wiki_start_pages=["Hlavní_strana"],
+ ),
+ "Danish": Language(
+ name="Danish",
+ iso_code="da",
+ use_ascii=True,
+ charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
+ alphabet="æøåÆØÅ",
+ wiki_start_pages=["Forside"],
+ ),
+ "German": Language(
+ name="German",
+ iso_code="de",
+ use_ascii=True,
+ charsets=["ISO-8859-1", "WINDOWS-1252"],
+ alphabet="äöüßÄÖÜ",
+ wiki_start_pages=["Wikipedia:Hauptseite"],
+ ),
+ "Greek": Language(
+ name="Greek",
+ iso_code="el",
+ use_ascii=False,
+ charsets=["ISO-8859-7", "WINDOWS-1253"],
+ alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
+ wiki_start_pages=["Πύλη:Κύρια"],
+ ),
+ "English": Language(
+ name="English",
+ iso_code="en",
+ use_ascii=True,
+ charsets=["ISO-8859-1", "WINDOWS-1252"],
+ wiki_start_pages=["Main_Page"],
+ ),
+ "Esperanto": Language(
+ name="Esperanto",
+ iso_code="eo",
+ # Q, W, X, and Y not used at all
+ use_ascii=False,
+ charsets=["ISO-8859-3"],
+ alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
+ wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
+ ),
+ "Spanish": Language(
+ name="Spanish",
+ iso_code="es",
+ use_ascii=True,
+ charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
+ alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
+ wiki_start_pages=["Wikipedia:Portada"],
+ ),
+ "Estonian": Language(
+ name="Estonian",
+ iso_code="et",
+ use_ascii=False,
+ charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
+ # C, F, Š, Q, W, X, Y, Z, Ž are only for
+ # loanwords
+ alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
+ wiki_start_pages=["Esileht"],
+ ),
+ "Finnish": Language(
+ name="Finnish",
+ iso_code="fi",
+ use_ascii=True,
+ charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
+ alphabet="ÅÄÖŠŽåäöšž",
+ wiki_start_pages=["Wikipedia:Etusivu"],
+ ),
+ "French": Language(
+ name="French",
+ iso_code="fr",
+ use_ascii=True,
+ charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
+ alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
+ wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
+ ),
+ "Hebrew": Language(
+ name="Hebrew",
+ iso_code="he",
+ use_ascii=False,
+ charsets=["ISO-8859-8", "WINDOWS-1255"],
+ alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
+ wiki_start_pages=["עמוד_ראשי"],
+ ),
+ "Croatian": Language(
+ name="Croatian",
+ iso_code="hr",
+ # Q, W, X, Y are only used for foreign words.
+ use_ascii=False,
+ charsets=["ISO-8859-2", "WINDOWS-1250"],
+ alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
+ wiki_start_pages=["Glavna_stranica"],
+ ),
+ "Hungarian": Language(
+ name="Hungarian",
+ iso_code="hu",
+ # Q, W, X, Y are only used for foreign words.
+ use_ascii=False,
+ charsets=["ISO-8859-2", "WINDOWS-1250"],
+ alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
+ wiki_start_pages=["Kezdőlap"],
+ ),
+ "Italian": Language(
+ name="Italian",
+ iso_code="it",
+ use_ascii=True,
+ charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
+ alphabet="ÀÈÉÌÒÓÙàèéìòóù",
+ wiki_start_pages=["Pagina_principale"],
+ ),
+ "Lithuanian": Language(
+ name="Lithuanian",
+ iso_code="lt",
+ use_ascii=False,
+ charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
+ # Q, W, and X not used at all
+ alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
+ wiki_start_pages=["Pagrindinis_puslapis"],
+ ),
+ "Latvian": Language(
+ name="Latvian",
+ iso_code="lv",
+ use_ascii=False,
+ charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
+ # Q, W, X, Y are only for loanwords
+ alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
+ wiki_start_pages=["Sākumlapa"],
+ ),
+ "Macedonian": Language(
+ name="Macedonian",
+ iso_code="mk",
+ use_ascii=False,
+ charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
+ alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
+ wiki_start_pages=["Главна_страница"],
+ ),
+ "Dutch": Language(
+ name="Dutch",
+ iso_code="nl",
+ use_ascii=True,
+ charsets=["ISO-8859-1", "WINDOWS-1252"],
+ wiki_start_pages=["Hoofdpagina"],
+ ),
+ "Polish": Language(
+ name="Polish",
+ iso_code="pl",
+ # Q and X are only used for foreign words.
+ use_ascii=False,
+ charsets=["ISO-8859-2", "WINDOWS-1250"],
+ alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
+ wiki_start_pages=["Wikipedia:Strona_główna"],
+ ),
+ "Portuguese": Language(
+ name="Portuguese",
+ iso_code="pt",
+ use_ascii=True,
+ charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
+ alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
+ wiki_start_pages=["Wikipédia:Página_principal"],
+ ),
+ "Romanian": Language(
+ name="Romanian",
+ iso_code="ro",
+ use_ascii=True,
+ charsets=["ISO-8859-2", "WINDOWS-1250"],
+ alphabet="ăâîșțĂÂÎȘȚ",
+ wiki_start_pages=["Pagina_principală"],
+ ),
+ "Russian": Language(
+ name="Russian",
+ iso_code="ru",
+ use_ascii=False,
+ charsets=[
+ "ISO-8859-5",
+ "WINDOWS-1251",
+ "KOI8-R",
+ "MacCyrillic",
+ "IBM866",
+ "IBM855",
+ ],
+ alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
+ wiki_start_pages=["Заглавная_страница"],
+ ),
+ "Slovak": Language(
+ name="Slovak",
+ iso_code="sk",
+ use_ascii=True,
+ charsets=["ISO-8859-2", "WINDOWS-1250"],
+ alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
+ wiki_start_pages=["Hlavná_stránka"],
+ ),
+ "Slovene": Language(
+ name="Slovene",
+ iso_code="sl",
+ # Q, W, X, Y are only used for foreign words.
+ use_ascii=False,
+ charsets=["ISO-8859-2", "WINDOWS-1250"],
+ alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
+ wiki_start_pages=["Glavna_stran"],
+ ),
+ # Serbian can be written in both Latin and Cyrillic, but there's no
+ # simple way to get the Latin alphabet pages from Wikipedia through
+ # the API, so for now we just support Cyrillic.
+ "Serbian": Language(
+ name="Serbian",
+ iso_code="sr",
+ alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
+ charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
+ wiki_start_pages=["Главна_страна"],
+ ),
+ "Thai": Language(
+ name="Thai",
+ iso_code="th",
+ use_ascii=False,
+ charsets=["ISO-8859-11", "TIS-620", "CP874"],
+ alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
+ wiki_start_pages=["หน้าหลัก"],
+ ),
+ "Turkish": Language(
+ name="Turkish",
+ iso_code="tr",
+ # Q, W, and X are not used by Turkish
+ use_ascii=False,
+ charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
+ alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
+ wiki_start_pages=["Ana_Sayfa"],
+ ),
+ "Vietnamese": Language(
+ name="Vietnamese",
+ iso_code="vi",
+ use_ascii=False,
+ # Windows-1258 is the only common 8-bit
+ # Vietnamese encoding supported by Python.
+ # From Wikipedia:
+ # For systems that lack support for Unicode,
+ # dozens of 8-bit Vietnamese code pages are
+ # available.[1] The most common are VISCII
+ # (TCVN 5712:1993), VPS, and Windows-1258.[3]
+ # Where ASCII is required, such as when
+ # ensuring readability in plain text e-mail,
+ # Vietnamese letters are often encoded
+ # according to Vietnamese Quoted-Readable
+ # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
+ # though usage of either variable-width
+ # scheme has declined dramatically following
+ # the adoption of Unicode on the World Wide
+ # Web.
+ charsets=["WINDOWS-1258"],
+ alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
+ wiki_start_pages=["Chữ_Quốc_ngữ"],
+ ),
+}