1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """This module contains functions for identifying languages based on language models.
22
23 It wraps U{libtextcat<http://software.wise-guys.nl/libtextcat/>} to get the language
24 identification functionality.
25
26 To use first create an instance of I{LanguageIdentifier} and then use the methods
27 I{identify} or I{identify_store} to detect the language in a string or in a translation
28 store respectively.
29 """
30
31 from ctypes import *
32 import ctypes.util
33
34 from translate.lang.data import *
35
36
# Handle to the loaded libtextcat shared library; stays None until a load
# succeeds.
textcat = None

# First ask the platform's library lookup for libtextcat under either of the
# names it is commonly installed as.
names = ['textcat', 'libtextcat']
for name in names:
    lib_location = ctypes.util.find_library(name)
    if lib_location:
        textcat = cdll.LoadLibrary(lib_location)
        if textcat:
            break
else:
    # The loop finished without a successful load: as a last resort hand the
    # bare file name to the dynamic loader and let it resolve the library.
    try:
        textcat = cdll.LoadLibrary('libtextcat.so')
    except OSError:
        # The caught exception was never used, so it is no longer bound to a
        # name; this spelling is valid on both Python 2 and Python 3.
        raise ImportError("textcat library not found")
54
55
# ctypes prototypes for the libtextcat entry points used by this module.
# NOTE(review): the handle is declared as c_int throughout; if the C API
# returns a pointer-sized handle this could truncate it on 64-bit platforms
# -- confirm against the libtextcat header before changing.

# textcat_Init(conf) -> handle
textcat.textcat_Init.argtypes = [c_char_p]
# ctypes reads the attribute named 'restype'; the former spelling 'retype'
# was not a ctypes attribute, so the return type was never actually declared.
textcat.textcat_Init.restype = c_int

# special_textcat_Init(config, model_dir) -> handle
textcat.special_textcat_Init.argtypes = [c_char_p, c_char_p]
textcat.special_textcat_Init.restype = c_int

# textcat_Done(handle)
textcat.textcat_Done.argtypes = [c_int]

# textcat_Classify(handle, text, size) -> result string
textcat.textcat_Classify.argtypes = [c_int, c_char_p, c_int]
textcat.textcat_Classify.restype = c_char_p
69
71
73 """
74 @param config: path to .conf for textcat
75 @type config: String
76 @param model_dir: path to language models
77 @type model_dir: String
78 """
79 if textcat is None:
80 return None
81 self._handle = textcat.special_textcat_Init(config, model_dir)
82
# Matches each bracketed entry of a '[lang][lang]' classification result and
# captures the text between the brackets.  Written as a raw string so the
# backslashes reach the regex engine without relying on unrecognised string
# escapes; the compiled pattern is unchanged.
lang_list_re = re.compile(r"\[(.+?)\]+")
84
86 """Converts a text result of '[lang][lang]' into a Python list of language codes"""
87 if lang_result in ('SHORT', 'UNKNOWN'):
88 return []
89 return self.lang_list_re.findall(lang_result)
90
def identify(self, text, sample_length=None):
    """Identify the language in I{text} by sampling I{sample_length}

    @param text: Text to be identified
    @type text: String
    @param sample_length: The amount of I{text} to be analysed, counted in
    bytes of the (UTF-8 encoded) text handed to textcat.  C{None}, or a
    value larger than the text, analyses all of it.
    @type sample_length: Int
    @return: list of C{(language code, 0.8)} tuples, one per language
    reported by textcat, with each code passed through
    C{simplify_to_common}
    """
    # Encode before measuring: textcat receives a byte buffer plus a byte
    # count, so the length must be clamped against the encoded text.
    # Clamping against the character count would drop the tail of any
    # non-ASCII text and could cut a multi-byte character in half.
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    if sample_length is None or sample_length > len(text):
        sample_length = len(text)
    lang_result = textcat.textcat_Classify(self._handle, text, sample_length)
    matches = self._lang_result_to_list(lang_result)
    # Every match is paired with the same fixed score of 0.8.
    return [(simplify_to_common(match, languages), 0.8) for match in matches]
106
def identify_store(self, store, sample_length=None):
    """Identify the language of a translation store

    The targets of the store's units are concatenated in order, stopping
    as soon as at least I{sample_length} characters have been collected,
    and the result is passed to L{identify}.

    @param store: Store to be identified
    @type store: L{TranslationStore <storage.base.TranslationStore>}
    @param sample_length: The amount of text to be analysed; C{None}
    collects the targets of all units
    @type sample_length: Int
    @return: whatever L{identify} returns for the collected text
    """
    pieces = []
    collected = 0
    for unit in store.units:
        pieces.append(unit.target)
        collected += len(unit.target)
        # The limit must be compared against sample_length; the previous
        # reference to the undefined name 'sample' raised NameError
        # whenever a sample length was supplied.
        if sample_length is not None and collected >= sample_length:
            break
    # Join once instead of growing a string inside the loop.
    return self.identify("".join(pieces), sample_length)
122
124 textcat.textcat_Done(self._handle)
125