Package translate :: Package lang :: Module identify
[hide private]
[frames] | no frames]

Source Code for Module translate.lang.identify

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2009 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # This program is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # This program is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with this program; if not, see <http://www.gnu.org/licenses/>. 
 20   
 21  """This module contains functions for identifying languages based on language models. 
 22   
 23     It wraps U{libtextcat<http://software.wise-guys.nl/libtextcat/>} to get the language 
 24     identification functionality. 
 25   
 26     To use it, first create an instance of I{LanguageIdentifier}, then call the methods  
 27     I{identify} or I{identify_store} to detect the language of a string or of a translation 
 28     store respectively. 
 29  """ 
 30   
 31  from ctypes import * 
 32  import ctypes.util 
 33   
 34  from translate.lang.data import * 
 35   
 36  # Load libtextcat 
 37  textcat = None 
 38  # 'textcat' is recognised on Unix, while only 'libtextcat' is recognised on 
 39  # windows. Therefore we test both. 
 40  names = ['textcat', 'libtextcat'] 
 41  for name in names: 
 42      lib_location = ctypes.util.find_library(name) 
 43      if lib_location: 
 44          textcat = cdll.LoadLibrary(lib_location) 
 45          if textcat: 
 46              break 
 47  else: 
 48      # Now we are getting desperate, so let's guess a unix type DLL that might  
 49      # be in LD_LIBRARY_PATH or loaded with LD_PRELOAD 
 50      try: 
 51          textcat = cdll.LoadLibrary('libtextcat.so') 
 52      except OSError, e: 
 53          raise ImportError("textcat library not found") 
 54   
 55  # Original initialisation 
 56  textcat.textcat_Init.argtypes = [c_char_p] 
 57  textcat.textcat_Init.retype = c_int 
 58   
 59  # Initialisation used in OpenOffice.org modification which allows the models to be in a different directory 
 60  textcat.special_textcat_Init.argtypes = [c_char_p, c_char_p] 
 61  textcat.special_textcat_Init.restype = c_int 
 62   
 63  # Cleans up textcat 
 64  textcat.textcat_Done.argtypes = [c_int] 
 65   
 66  # Perform language guessing 
 67  textcat.textcat_Classify.argtypes = [c_int, c_char_p, c_int] 
 68  textcat.textcat_Classify.restype = c_char_p 
 69   
class LanguageIdentifier(object):
    """Identifies the language of text through a libtextcat handle.

    The handle is obtained from C{special_textcat_Init} when the instance is
    created and handed back to C{textcat_Done} when the instance is deleted.
    """

    # Extracts each bracketed entry from a result such as '[en][fr]'.
    lang_list_re = re.compile(r"\[(.+?)\]+")

    def __init__(self, config, model_dir):
        """
        @param config: path to .conf for textcat
        @type config: String
        @param model_dir: path to language models
        @type model_dir: String
        """
        # Define the attribute up front so that __del__ never hits a missing
        # attribute when initialisation stops early or raises.
        self._handle = None
        if textcat is None:
            return
        self._handle = textcat.special_textcat_Init(config, model_dir)

    def _lang_result_to_list(self, lang_result):
        """Converts a text result of '[lang][lang]' into a Python list of language codes

        @param lang_result: result string returned by textcat_Classify
        @type lang_result: String
        @return: list of language codes; empty for 'SHORT', 'UNKNOWN' or an
                 empty/missing result
        """
        # ctypes maps a NULL char* return value to None; treat it as "no match"
        if not lang_result or lang_result in ('SHORT', 'UNKNOWN'):
            return []
        return self.lang_list_re.findall(lang_result)

    def identify(self, text, sample_length=None):
        """Identify the language in I{text} by sampling I{sample_length}

        @param text: Text to be identified
        @type text: String
        @param sample_length: The amount of I{text} to be analysed, counted
                              on the UTF-8 encoded text; defaults to all of it
        @type sample_length: Int
        @return: list of (language code, 0.8) tuples, one per match
        """
        # Encode before measuring: the length passed to textcat_Classify
        # refers to the encoded buffer, which is longer than the unicode
        # string whenever the text contains non-ASCII characters.
        if isinstance(text, unicode):
            text = text.encode('utf-8')
        if sample_length is None or sample_length > len(text):
            sample_length = len(text)
        lang_result = textcat.textcat_Classify(self._handle, text, sample_length)
        matches = self._lang_result_to_list(lang_result)
        return [(simplify_to_common(match, languages), 0.8) for match in matches]

    def identify_store(self, store, sample_length=None):
        """Identify the language of a translation store

        @param store: Store to be identified
        @type store: L{TranslationStore <storage.base.TranslationStore>}
        @param sample_length: The amount of text to be analysed
        @type sample_length: Int
        @return: the result of L{identify} on the concatenated unit targets
        """
        pieces = []
        collected = 0
        for unit in store.units:
            target = unit.target
            if not target:
                # nothing to sample from an empty or missing target
                continue
            pieces.append(target)
            collected += len(target)
            # Stop gathering text once there is enough for the sample
            if sample_length is not None and collected >= sample_length:
                break
        return self.identify("".join(pieces), sample_length)

    def __del__(self):
        """Release the textcat handle, if one was ever obtained."""
        handle = getattr(self, '_handle', None)
        if not handle:
            return
        # Module globals may already be cleared while the interpreter exits
        if textcat is not None:
            textcat.textcat_Done(handle)
        self._handle = None
125