1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
"""Module to guess the language ISO code based on the 'Language-Team' entry in
the header of a Gettext PO file."""
23
24 import re
25
26 from translate.misc.typecheck import accepts, returns, IsOneOf
27 from translate.misc.typecheck.typeclasses import String
28
29 __all__ = ['LANG_TEAM_CONTACT_SNIPPETS', 'guess_language']
30
# NOTE: the patterns are raw strings and every literal dot in a domain name is
# escaped, so that e.g. "@li.org" cannot match "@liXorg".
LANG_TEAM_REGEX = (
    ("@li.org", r"([a-z_A-Z]{2,})@li\.org", ["LL", "XX", "TEAM"]),
    ("translation-team",
     r"translation-team-([a-z_A-Z]+)@lists\.sourceforge\.net", None),
    ("fedora-trans", r"fedora-trans-([a-z_A-Z]+)@redhat\.com", ["list"]),
    ("ubuntu-l10n", r"ubuntu-l10n-([a-z_A-Z]+)@lists\.ubuntu\.com", None),
    ("translate-discuss",
     r"translate-discuss-([a-z_A-Z]+)@lists\.sourceforge\.net", None),
    ("kde-i18n",
     r"kde-i18n-([a-z_A-Z]+)@(?:lists\.|mail\.|)kde\.org", ["doc"]),
    ("kde-l10n", r"kde-l10n-([a-z_A-Z]+)@kde\.org", None),
    ("fedoraproject", r"trans-([a-z_A-Z]+)@lists\.fedoraproject\.org", None),
    ("gnome.org", r"gnome-([a-z_A-Z]+)-list@gnome\.org", ["latin"]),
)
"""Data for regular expression based extraction. The fields are: prefilter
information (a plain substring), regex with a single group that contains the
language code, postfilter (a list of captured values to reject, or None)."""
47
LANG_TEAM_CONTACT_SNIPPETS = {
    "af": ("i18n@af.org.za", "Petri Jooste",),
    "am": ("@geez.org",),
    "ar": ("arabeyes.org", "Arabeyes",),
    "as": ("assam@mm.assam-glug.org",),
    "ast": (
        "@softastur.org", "launchpad.net/~ubuntu-l10n-ast",
        "softast-xeneral@lists.sourceforge.net", "Softastur",
    ),
    "az": ("linuxaz@azerimal.net", "gnome@azitt.com", u"gnome@azətt.com",),
    "az_IR": ("az-ir@lists.sharif.edu",),
    "be": ("i18n@mova.org", "i18n@tut.by", "mozilla_byx@poczta.fm",),
    "be@latin": ("translation-team-be-latin@lists", "be-latin.open-tran.eu",),
    "bg": ("dict@fsa-bg.org", "dict@linux.zonebg.com",),
    "bn": (
        "gnome-translation@bengalinux.org", "core@bengalinux.org",
        "ankur-bd-l10n@googlegroups.com",
        "redhat-translation@bengalinux.org",
    ),
    "bn_IN": ("anubad@lists.ankur.org.in",),
    "br": (
        "drouizig@drouizig.org", "brenux@free.fr",
        "tradgnome@softcatala.net", "fedora@softcatala.org",
    ),
    "bs": ("lokal@linux.org.ba", "lokal@lugbih.org",),
    "ca": ("@softcatala.org",),
    "crh": ("tilde-birlik-tercime@lists.sourceforge.net",),
    "cs": (
        "fedora-cs-list@redhat.com", "cs-users@lists.fedoraproject.org",
        "debian-l10n-czech@lists.debian.org",
        "kde-czech-apps@lists.sourceforge.net",
        "kde-czech-apps@lists.sf.net", "translations.cs@gnupg.cz",
    ),
    "cy": (
        "gnome-cy@lists.linux.org.uk", "gnome-cy@pengwyn.linux.org.uk",
        "gnome-cy@www.linux.org", "gnome-cy@www.linux.org.uk",
        "cy@pengwyn.linux.org.uk",
    ),
    "da": (
        "dansk@dansk-gruppen.dk", "dansk@klid.dk",
        "sslug-locale@sslug.dk",
    ),
    "de": ("gnome-de@gnome.org", "debian-l10n-german@lists.debian.org",),
    "dz": ("pgeyleg@dit.gov.bt", "pgyeleg@dit.gov.bt",),
    "el": (
        "debian-l10n-greek@lists.debian.org", "i18ngr@lists.hellug.gr",
        "i18n@hellug.gr", "nls@tux.hellug.gr", "team@gnome.gr",
        "team@lists.gnome.gr", "users@el.openoffice.org",
    ),
    "en_AU": ("trans@six-by-nine.com.au",),
    "en_CA": ("adamw@gnome.org", "adamw@freebsd.org",),
    "en_GB": ("kde-en-gb@kde.me.uk",),
    "en@shaw": (
        "ubuntu-l10n-en-shaw@launchpad.net",
        "ubuntu-l10n-en-shaw@lists.launchpad.net",
    ),
    "eo": (
        "eo-tradukado@lists.tuxfamily.org",
        "debian-l10n-esperanto@lists.debian.org",
        "ubuntu-l10n-eo@lists.launchpad.net",
        "eo-tradukado.tuxfamily.org",
    ),
    "es": (
        "pgsql-es-ayuda@postgresql.org",
        "debian-l10n-spanish@lists.debian.org",
        "gnome-es@gnome.org", "traductores@es.gnome.org",
    ),
    "et": (
        "gnome-et@linux.ee", "kde-et@linux.ee", "linux-ee@lists.eenet.ee",
        "linux-et@lists.eenet.ee", "et-gnome@linux.ee",
        "linux-ee@eenet.ee",
    ),
    "eu": (
        "debian-l10n-basque@lists.debian.org",
        "debian-l10n-eu@lists.debian.org", "itzulpena@euskalgnu.org",
        "gnome@euskalgnu.org", "librezale@librezale.org",
        "linux-eu@chanae.alphanet.ch",
    ),
    "fa": ("farsi@lists.sharif.edu", "Farsiweb.info",),
    "fi": (
        "debian-l10n-finnish@lists.debian.org",
        "gnome-fi-laatu@lists.sourceforge.net", "laatu@lokalisointi.org",
        "lokalisointi-laatu@linux-aktivaattori.org", "laatu@gnome.fi",
        "yast-trans-fi@kotoistaminen.novell.fi",
    ),
    "fr": (
        "debian-l10n-french@lists.debian.org", "gnomefr@traduc.org",
        "kde-francophone@kde.org", "traduc@traduc.org",
        "pgsql-fr-generale@postgresql.org", "rpm-fr@livna.org",
    ),
    "ga": (
        "gaeilge-gnulinux@lists.sourceforge.net",
        "gaeilge-a@listserv.heanet.ie",
    ),
    "gl": (
        "trasno@ceu.fi.udc.es", "gnome@g11n.net",
        "gpul-traduccion@ceu.fi.udc.es", "proxecto@trasno.net",
        "trasno@gpul.org",
    ),
    "gu": ("indianoss-gujarati@lists.sourceforge.net",),
    "he": (
        "debian-hebrew-common@lists.alioth.debian.org",
        "kde-il@yahoogroups.com", "fedora-he-list@redhat.com",
        "mdk-hebrew@iglu.org.il",
    ),
    "hi": (
        "indlinux-hindi-gnome@lists.sourceforge.net",
        "indlinux-hindi@lists.sourceforge.net",
    ),
    "hr": ("translator-shop.org", "lokalizacija@linux.hr",),
    "hu": (
        "debian-l10n-hungarian@lists.debian.org", "gnome@fsf.hu",
        "gnome@gnome.hu", "magyar@lists.linux.hu",
    ),
    "id": (
        "@id.gnome.org", "@gnome.linux.or.id", "mdk-id@yahoogroups.com",
        "linux.or.id", "gnome@i15n.org",
    ),
    "io": ("gnome-ido@lists.mterry.name",),
    "is": ("gnome@techattack.nu", "kde-isl@mmedia.is", "kde-isl@molar.is",),
    "it": (
        "debian-l10n-italian@lists.debian.org", "traduzioni@itpug.org",
        "fedora-trans-it@redhat.com", "tp@lists.linux.it",
    ),
    "ja": (
        "debian-doc@debian.or.jp", "debian-japanese@lists.debian.org",
        "gnome-translation@gnome.gr.jp", "translation@gnome.gr.jp",
        "jpug-doc@ml.postgresql.jp",
    ),
    "ka": (
        "geognome@googlegroups.com",
        "Ubuntu-Georgian-Translators@googlegroups.com",
    ),
    "kk": ("kk_KZ@googlegroups.com",),
    "km": ("@khmeros.info",),
    "kn": ("debian-l10n-kannada@lists.debian.org",),
    "ko": (
        "gnome-kr-hackers@list.kldp.net", "gnome-kr-hackers@lists.kldp.net",
        "gnome-kr-translation@lists.kldp.net", "pgsql-kr@postgresql.or.kr",
        "hangul-hackers@lists.kldp.net",
        "debian-l10n-korean@lists.debian.org",
        "gnome-kr-translation@lists.sourceforge.net",
    ),
    "ks": ("ks-gnome-trans-commits@lists.code.indlinux.net",),
    "ku": ("gnu-ku-wergerandin@lists.sourceforge.net",),
    "ky": ("i18n-team-ky-kyrgyz@lists.sourceforge.net", "ky-li@mail.ru",),
    "la": ("gnome-latin-list@gnome.org",),
    "li": ("li@gnome.org",),
    "lt": (
        "gimp-lt@lists.akl.lt", "gnome-lt@lists.akl.lt",
        "gnome-lt@lists.gnome.org", "komp_lt@konferencijos.lt",
    ),
    "lv": (
        "lata-l10n@googlegroups.com", "lata-i18n@groups.google.com",
        "locale@laka.lv", "ll10nt@os.lv",
    ),
    "mai": ("maithili.sf.net",),
    "mg": ("i18n-malagasy-gnome@gnome.org",),
    "mi": ("maori@nzlinux.org.nz",),
    "mk": (
        "gnomk-main@lists.sourceforge.net", "lug@lists.linux.net.mk",
        "mkde-l10n@lists.sourceforge.net",
        "ossm-members@hedona.on.net.mk",
    ),
    "ml": ("smc-discuss@googlegroups.com",),
    "mn": ("openmn-", "openmn.org",),
    "ms": (
        "gabai-penyumbang@lists.sourceforge.net",
        "gabai-penyumbang@lists.sf.net", "kedidiemas@yahoogroups.com",
    ),
    "nb": ("i18n-nb@lister.ping.uio.no",),
    "nds": ("nds-lowgerman@lists.sourceforge.net",),
    "ne": ("info@mpp.org.np",),
    "nl": (
        "debian-l10n-dutch@lists.debian.org", "vertaling@nl.gnome.org",
        "vertaling@vrijschrift.org", "nl@vrijschrift.org",
        "vertaling@nl.linux.org", "vertaling@nl.li.org",
    ),
    "nn": ("i18n-nn@lister.ping.uio.no",),
    "nso": ("sepedi@translate.org.za",),
    "or": ("oriya-group@lists.sarovar.org", "oriya-it@googlegroups.com",),
    "pa": (
        "punjabi-l10n@users.sf.net", "fedora-pa-list@redhat.com",
        "punjabi-users@lists.sf.net", "punjabi-l10n@lists.sourceforge.net",
        "punlinux-i18n@lists.sourceforge.net",
    ),
    "pl": (
        "gnomepl@aviary.pl", "debian-l10n-polish@lists.debian.org",
        "gnome-l10n@lists.aviary.pl", "translators@gnomepl.org",
    ),
    "ps": ("pathanisation@googelgroups.com",),
    "pt": (
        "fedora-trans-pt@redhat.org", "gnome_pt@yahoogroups.com",
        "traduz@debianpt.org", "traduz@debian.pt",
    ),
    "pt_BR": (
        "gnome-l10n-br@listas.cipsga.org.br",
        "gnome-pt_br-list@gnome.org", "fedora-docs-br@redhat.com",
        "fedora-trans-pt-br@redhat.com", "ldp-br@bazar.conectiva.com.br",
        "pgbr-dev@postgresql.org.br",
        "pgbr-dev@listas.postgresql.org.br",
        "debian-l10n-portuguese@lists.debian.org",
    ),
    "ro": (
        "fedora-ro@googlegroups.com", "gnomero-list@lists.sourceforge.net",
        "debian-l10n-romanian@lists.debian.org",
    ),
    "ru": (
        "pgsql-rus@yahoogroups.com", "debian-l10n-russian@lists.debian.org",
        "gnupg-ru@gnupg.org",
    ),
    "sk": ("sk-i18n@lists.linux.sk", "kde-sk@linux.sk",),
    "sl": ("gnome-si@googlegroups.com",),
    "sq": (
        "gnome-albanian-perkthyesit@lists.sourceforge.net",
        "debian-l10n-albanian@lists.debian.org",
    ),
    "sr": ("@prevod.org", "serbiangnome-lista@nongnu.org",),
    "sv": ("debian-l10n-swedish@lists.debian.org", "tp-sv@listor.tp-sv.se",),
    "ta": (
        "gnome-tamil-translation@googlegroups.com",
        "tamilinix@yahoogroups.com", "Ubuntu-l10n-tam@lists.ubuntu.com",
        "tamil-DI@yahoogroups.com",
    ),
    "te": (
        "localisation@swecha.org",
        "indlinux-telugu@lists.sourceforge.net",
    ),
    "th": (
        "l10n@opentle.org", "thai-l10n@googlegroup.com",
        "thailang@buraphalinux.org", "thai-l10n@googlegroups.com",
        "l10n.opentle.org",
    ),
    "tk": ("kakilikgroup@yahoo.com",),
    "tl": ("debian-tl@banwa.upm.edu.ph",),
    "tr": (
        "debian-l10n-turkish@lists.debian.org", "gnome-turk@gnome.org",
        "gnu-tr-u12a@lists.sourceforge.net", "turkce@pardus.org.tr",
    ),
    "tt": ("tatarish.l10n@gmail.com",),
    "ug": ("gnome-uighur@yahoogroups.com",),
    "uk": ("linux@linux.org.ua",),
    "ur": ("l10n@urduweb.org", "urdu.scs.gift@gmail.com",),
    "ve": ("venda@translate.org.za",),
    "vi": ("gnomevi-list@lists.sourceforge.net", "vi-VN@googlegroups.com",),
    "wa": ("linux-wa@",),
    "xh": (
        "xh-translate@ubuntu.com", "xhosa@translate.org.za",
        "xhosa@ubuntu.com",
    ),
    "zh_CN": (
        "i18n-translation@lists.linux.net.cn",
        "i18n-zh@googlegroups.com",
        "translation-team-zh-cn@lists.sourceforge.net",
        "i18n-zh@googlegroup.com",
    ),
    "zh_TW": (
        "zh-l10n@lists.linux.org.tw", "chinese-l10n@googlegroups.com",
        "community@linuxhall.org", "zh-l10n@linux.org.tw",
    ),
    "zu": ("zulu@translate.org.za",),
}
"""Mapping of language code to a tuple of contact information snippets
(mailing list addresses, domains, names) that can be used to uniquely
identify the language."""
226
LANG_TEAM_LANGUAGE_SNIPPETS = {
    "af": ("Afrikaans",),
    "am": ("Amharic",),
    "ang": ("Old English",),
    "ar": ("Arabic",),
    "as": ("Assamese",),
    "ast": ("Asturian",),
    "az": ("Azerbaijani", u"Azərbaycan",),
    "bg": ("Bulgarian",),
    "be@latin": ("Belarusian Latin",),
    "be": ("Belarusian", "Belorussian",),
    "bn_IN": ("Bengali (India)", "Bengali INDIA", "Bengali India",),
    "bn": ("Bangladeshi", "Bengali",),
    "br": ("Breton", "Britton",),
    "bs": ("Bosanski", "Bosnian",),
    "byn": ("Blin",),
    "ca": ("Catalan",),
    "ckb": ("Kurdish (Sorani)",),
    "crh": ("Crimean Tatar", "Crimean Turkish",),
    "cs": ("Czech",),
    "cy": ("Cymru", "Welsh",),
    "da": ("Danish", "Dansk",),
    "de": ("Deutsch", "German",),
    "dz": ("Dzongkha",),
    "el": ("Greek",),
    "en_GB": ("British English", "en_GB", "English (Great Britain)",),
    "eo": ("Esperanto",),
    "es": ("Spanish", "es_ES", u"Español",),
    "et": ("Eesti", "Estonian",),
    "eu": ("Basque", "Euskara",),
    "fa": ("Persian",),
    "fi": ("Finnish", "Suomi",),
    "fo": ("Faroese",),
    "fr": ("French", u"Français",),
    "fur": ("Friulian",),
    "ga": ("Irish",),
    "gez": ("Geez",),
    "gl": ("Galego", "Galician", "Gallegan", "gl_ES",),
    "gu": ("Gujarati",),
    "haw": ("Hawaiian",),
    "he": ("Hebrew",),
    "hi": ("Hindi",),
    "hr": ("Croatian",),
    "hu": ("Hungarian",),
    "hy": ("Armenian",),
    "ia": ("Interlingua",),
    "id": ("Bahasa Indonesia", "Indonesia", "Indonesian",),
    "ig": ("Igbo",),
    "is": ("Icelandic",),
    "it": ("Italian",),
    "ja": ("Japanese",),
    "ka": ("Georgian",),
    "kk": ("Kazakh",),
    "km": ("Khmer",),
    "kn": ("Kannada",),
    "ko": ("Korean", "Hangul",),
    "kok": ("Konkani",),
    "ks": ("Kashmiri",),
    "ku": ("Kurdish",),
    "ky": ("Kitghiz", "Kirghiz",),
    "lg": ("Luganda",),
    "li": ("Limburgish",),
    "lt": ("Lithuanian",),
    "lv": ("Latvian", "lv_LV", "Valoda", u"Latviešu",),
    # NOTE(review): "mal" and "ml" both list "Malayalam", so which code a
    # lookup by name yields depends on dict iteration order - confirm which
    # one is intended before removing either.
    "mal": ("Malayalam",),
    "mg": ("Malagasy",),
    "mi": ("Maori",),
    "mk": ("Macedonian",),
    "ml": ("Malayalam",),
    "mn": ("Mongolian",),
    # Marathi is "mr" in ISO 639-1; "mt" (used here previously) is Maltese.
    "mr": ("Marathi",),
    "ms": ("Malay", "Bahasa Melayu",),
    "my": ("Burmese",),
    "nb": (
        "Norwegian Bokmaal", u"Norsk bokmål", u"Norwegian Bokmål",
        u"Norwegian bokmål",
    ),
    "nds": ("Low Saxon",),
    "nl": ("Dutch", "Nederlands",),
    "nn": ("Norwegian nynorsk", "Nynorsk",),
    "oc": ("Occitan",),
    "or": ("Oriya",),
    "pa": ("Punjabi", "Panjabi",),
    "pl": ("Polish",),
    "ps": ("Pashto", "Pushto",),
    "pt_BR": (
        "Brazilian Portuguese", u"Português/Brasil",
        u"Português do Brasil",
    ),
    "pt": ("Portuguese",),
    "rm": ("Rhaeto-Romance",),
    "ro": ("Romania", "Romanian", u"Română",),
    "ru": ("Russian",),
    "si": ("Sinhala", "Sinhalese",),
    "sk": ("Slovak",),
    "sl": ("Slovene", "Slovenian",),
    "so": ("Somali",),
    "sq": ("Albanian",),
    "sr": ("Serbian",),
    "sv": ("Swedish",),
    "sw": ("Swahili",),
    "ta": ("Tamil",),
    "te": ("Telugu",),
    "tet": ("Tetum",),
    "tg": ("Tajik",),
    "th": ("Thai",),
    "ti": ("Tigrinya",),
    "tig": ("Tigre",),
    "tl": ("Tagalog",),
    "tr": ("Turkish", u"Türkçe", u"Türkiye",),
    "tt": ("Tatarish",),
    "ug": ("Uighur",),
    "uk": ("Ukrainian",),
    "ur": ("Urdu",),
    "uz": ("Uzbek",),
    "ve": ("Venda", u"Tshivenḓa", "Tshivenda",),
    "vi": ("Vietnamese",),
    "wa": ("Walloon",),
    "wal": ("Walamo",),
    "wo": ("Wolof",),
    "xh": ("Xhosa", "IsiXhosa", "isiXhosa",),
    "yi": ("Yiddish",),
    "yo": ("Yoruba",),
    "zh_CN": (
        "Chinese Simplified", "Chinese/Simplified",
        "Chinese (simplified)", "Simplified Chinese",
    ),
    "zh_HK": ("Chinese (Hong Kong)",),
    "zh_TW": (
        "Chinese (traditional)", "Chinese/Traditional",
        "Traditional Chinese",
    ),
}
"""Language codes with snippets of language names, including English, native
spelling and variants, that can be used to uniquely identify the language"""
354
355
356 -def _regex_guesser(prefilter, regex, string, postfilter=None):
357 """Use regular expressions to extract the language team
358
359 @param prefilter: simple filter to apply before attempting the regex
360 @param regex: regular expression with one group that will contain
361 the language code
362 @param string: the language team string that should be examined
363 @param postfilter: filter to apply to reject any potential matches
364 after they have been retreived by the regex
365 @return: ISO language code for the found language
366 """
367
368
369
370
371
372 if prefilter in string:
373 found = re.search(regex, string)
374 if found:
375 regex_lang = found.groups()[0]
376 else:
377 return None
378 if postfilter is not None and regex_lang in postfilter:
379 return None
380 if regex_lang and regex_lang != 'en':
381 return regex_lang
382 return None
383
386 """Return the supplied text unchanged"""
387 return text
388
391 """Convert the supplied text to lowercase"""
392 return text.lower()
393
396 """Guess the language based on a snippet of text in the language team
397 string.
398
399 @param snippets_dict: A dict of snippets that can be used to identify a
400 language in the format {'lang': ('snippet1', 'snippet2'), 'lang2'...}
401 @param string: The language string to be analysed
402 @param filter_: a function to be applied to the string and snippets
403 before examination
404 """
405 string = filter_(string)
406 for possible_lang, snippets in snippets_dict.iteritems():
407 for snippet in snippets:
408 if filter_(snippet) in string:
409 return possible_lang
410 return None
411
412
413 @accepts(unicode)
414 @returns(IsOneOf(String, type(None)))
415 -def guess_language(team_string):
436
437 if __name__ == "__main__":
438 from sys import argv
439 from translate.storage import factory
440 for fname in argv[1:]:
441 store = factory.getobject(fname)
442 print fname, guess_language(store.parseheader().get('Language-Team', u""))
443