Package translate :: Package storage :: Module tmdb
[hide private]
[frames] | [no frames]

Source Code for Module translate.storage.tmdb

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2009 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """Module to provide a translation memory database.""" 
 23  import math 
 24  import time 
 25  import logging 
 26  import re 
 27  try: 
 28      from sqlite3 import dbapi2 
 29  except ImportError: 
 30      from pysqlite2 import dbapi2 
 31   
 32  from translate.search.lshtein import LevenshteinComparer 
 33  from translate.lang import data 
 34   
 35   
 36  STRIP_REGEXP = re.compile("\W", re.UNICODE) 
 37   
class LanguageError(Exception):
    """Raised when a required source or target language is undefined."""

    def __init__(self, value):
        # Forward value to Exception so e.args and repr() are populated;
        # the original skipped this, leaving args as an empty tuple.
        Exception.__init__(self, value)
        self.value = value

    def __str__(self):
        return str(self.value)
class TMDB(object):
    """Translation memory database backed by sqlite."""

    # Maps db_file path -> shared sqlite connection (class-level cache).
    _tm_dbs = {}

    def __init__(self, db_file, max_candidates=3, min_similarity=75, max_length=1000):
        """Open (or reuse) the sqlite database at db_file and prepare it.

        :param db_file: path to the sqlite database file
        :param max_candidates: maximum number of suggestions to return
        :param min_similarity: minimum Levenshtein similarity (percent)
        :param max_length: longest source string considered for matching
        """
        self.max_candidates = max_candidates
        self.min_similarity = min_similarity
        self.max_length = max_length

        # share connections to same database file between different instances
        # (dict.has_key() is deprecated and gone in Python 3 -- use "in")
        if db_file not in self._tm_dbs:
            self._tm_dbs[db_file] = dbapi2.connect(db_file)

        self.connection = self._tm_dbs[db_file]
        self.cursor = self.connection.cursor()

        #FIXME: do we want to do any checks before we initialize the DB?
        self.init_database()
        self.fulltext = False
        self.init_fulltext()

        self.comparer = LevenshteinComparer(self.max_length)

        self.preload_db()
69
70 - def init_database(self):
71 """creates database tables and indices""" 72 73 script = """ 74 CREATE TABLE IF NOT EXISTS sources ( 75 sid INTEGER PRIMARY KEY AUTOINCREMENT, 76 text VARCHAR NOT NULL, 77 context VARCHAR DEFAULT NULL, 78 lang VARCHAR NOT NULL, 79 length INTEGER NOT NULL 80 ); 81 CREATE INDEX IF NOT EXISTS sources_context_idx ON sources (context); 82 CREATE INDEX IF NOT EXISTS sources_lang_idx ON sources (lang); 83 CREATE INDEX IF NOT EXISTS sources_length_idx ON sources (length); 84 CREATE UNIQUE INDEX IF NOT EXISTS sources_uniq_idx ON sources (text, context, lang); 85 86 CREATE TABLE IF NOT EXISTS targets ( 87 tid INTEGER PRIMARY KEY AUTOINCREMENT, 88 sid INTEGER NOT NULL, 89 text VARCHAR NOT NULL, 90 lang VARCHAR NOT NULL, 91 time INTEGER DEFAULT NULL, 92 FOREIGN KEY (sid) references sources(sid) 93 ); 94 CREATE INDEX IF NOT EXISTS targets_sid_idx ON targets (sid); 95 CREATE INDEX IF NOT EXISTS targets_lang_idx ON targets (lang); 96 CREATE INDEX IF NOT EXISTS targets_time_idx ON targets (time); 97 CREATE UNIQUE INDEX IF NOT EXISTS targets_uniq_idx ON targets (sid, text, lang); 98 """ 99 100 try: 101 self.cursor.executescript(script) 102 self.connection.commit() 103 except: 104 self.connection.rollback() 105 raise
106
107 - def init_fulltext(self):
108 """detects if fts3 fulltext indexing module exists, initializes fulltext table if it does""" 109 110 #HACKISH: no better way to detect fts3 support except trying to construct a dummy table?! 111 try: 112 script = """ 113 DROP TABLE IF EXISTS test_for_fts3; 114 CREATE VIRTUAL TABLE test_for_fts3 USING fts3; 115 DROP TABLE test_for_fts3; 116 """ 117 self.cursor.executescript(script) 118 logging.debug("fts3 supported") 119 # for some reason CREATE VIRTUAL TABLE doesn't support IF NOT EXISTS syntax 120 # check if fulltext index table exists manually 121 self.cursor.execute("SELECT name FROM sqlite_master WHERE name = 'fulltext'") 122 if not self.cursor.fetchone(): 123 # create fulltext index table, and index all strings in sources 124 script = """ 125 CREATE VIRTUAL TABLE fulltext USING fts3(text); 126 """ 127 logging.debug("fulltext table not exists, creating") 128 self.cursor.executescript(script) 129 logging.debug("created fulltext table") 130 else: 131 logging.debug("fulltext table already exists") 132 133 # create triggers that would sync sources table with fulltext index 134 script = """ 135 INSERT INTO fulltext (rowid, text) SELECT sid, text FROM sources WHERE sid NOT IN (SELECT rowid FROM fulltext); 136 CREATE TRIGGER IF NOT EXISTS sources_insert_trig AFTER INSERT ON sources FOR EACH ROW 137 BEGIN 138 INSERT INTO fulltext (docid, text) VALUES (NEW.sid, NEW.text); 139 END; 140 CREATE TRIGGER IF NOT EXISTS sources_update_trig AFTER UPDATE OF text ON sources FOR EACH ROW 141 BEGIN 142 UPDATE fulltext SET text = NEW.text WHERE docid = NEW.sid; 143 END; 144 CREATE TRIGGER IF NOT EXISTS sources_delete_trig AFTER DELETE ON sources FOR EACH ROW 145 BEGIN 146 DELETE FROM fulltext WHERE docid = OLD.sid; 147 END; 148 """ 149 self.cursor.executescript(script) 150 self.connection.commit() 151 logging.debug("created fulltext triggers") 152 self.fulltext = True 153 154 except dbapi2.OperationalError, e: 155 self.fulltext = False 156 logging.debug("failed to initialize 
fts3 support: " + str(e)) 157 script = """ 158 DROP TRIGGER IF EXISTS sources_insert_trig; 159 DROP TRIGGER IF EXISTS sources_update_trig; 160 DROP TRIGGER IF EXISTS sources_delete_trig; 161 """ 162 self.cursor.executescript(script)
163
164 - def preload_db(self):
165 """ugly hack to force caching of sqlite db file in memory for 166 improved performance""" 167 if self.fulltext: 168 query = """SELECT COUNT(*) FROM sources s JOIN fulltext f ON s.sid = f.docid JOIN targets t on s.sid = t.sid""" 169 else: 170 query = """SELECT COUNT(*) FROM sources s JOIN targets t on s.sid = t.sid""" 171 self.cursor.execute(query) 172 (numrows,) = self.cursor.fetchone() 173 logging.debug("tmdb has %d records" % numrows) 174 return numrows
175
176 - def add_unit(self, unit, source_lang=None, target_lang=None, commit=True):
177 """inserts unit in the database""" 178 #TODO: is that really the best way to handle unspecified 179 # source and target languages? what about conflicts between 180 # unit attributes and passed arguments 181 if unit.getsourcelanguage(): 182 source_lang = unit.getsourcelanguage() 183 if unit.gettargetlanguage(): 184 target_lang = unit.gettargetlanguage() 185 186 if not source_lang: 187 raise LanguageError("undefined source language") 188 if not target_lang: 189 raise LanguageError("undefined target language") 190 191 unitdict = {"source" : unit.source, 192 "target" : unit.target, 193 "context": unit.getcontext() 194 } 195 self.add_dict(unitdict, source_lang, target_lang, commit)
196
197 - def add_dict(self, unit, source_lang, target_lang, commit=True):
198 """inserts units represented as dictionaries in database""" 199 source_lang = data.normalize_code(source_lang) 200 target_lang = data.normalize_code(target_lang) 201 try: 202 try: 203 self.cursor.execute("INSERT INTO sources (text, context, lang, length) VALUES(?, ?, ?, ?)", 204 (unit["source"], 205 unit["context"], 206 source_lang, 207 len(unit["source"]))) 208 sid = self.cursor.lastrowid 209 except dbapi2.IntegrityError: 210 # source string already exists in db, run query to find sid 211 self.cursor.execute("SELECT sid FROM sources WHERE text=? AND context=? and lang=?", 212 (unit["source"], 213 unit["context"], 214 source_lang)) 215 sid = self.cursor.fetchone() 216 (sid,) = sid 217 try: 218 #FIXME: get time info from translation store 219 #FIXME: do we need so store target length? 220 self.cursor.execute("INSERT INTO targets (sid, text, lang, time) VALUES (?, ?, ?, ?)", 221 (sid, 222 unit["target"], 223 target_lang, 224 int(time.time()))) 225 except dbapi2.IntegrityError: 226 # target string already exists in db, do nothing 227 pass 228 229 if commit: 230 self.connection.commit() 231 except: 232 if commit: 233 self.connection.rollback() 234 raise
235
236 - def add_store(self, store, source_lang, target_lang, commit=True):
237 """insert all units in store in database""" 238 count = 0 239 for unit in store.units: 240 if unit.istranslatable() and unit.istranslated(): 241 self.add_unit(unit, source_lang, target_lang, commit=False) 242 count += 1 243 if commit: 244 self.connection.commit() 245 return count
246
247 - def add_list(self, units, source_lang, target_lang, commit=True):
248 """insert all units in list into the database, units are 249 represented as dictionaries""" 250 count = 0 251 for unit in units: 252 self.add_dict(unit, source_lang, target_lang, commit=False) 253 count += 1 254 if commit: 255 self.connection.commit() 256 return count
257
258 - def translate_unit(self, unit_source, source_langs, target_langs):
259 """return TM suggestions for unit_source""" 260 if isinstance(unit_source, str): 261 unit_source = unicode(unit_source, "utf-8") 262 if isinstance(source_langs, list): 263 source_langs = [data.normalize_code(lang) for lang in source_langs] 264 source_langs = ','.join(source_langs) 265 else: 266 source_langs = data.normalize_code(source_langs) 267 if isinstance(target_langs, list): 268 target_langs = [data.normalize_code(lang) for lang in target_langs] 269 target_langs = ','.join(target_langs) 270 else: 271 target_langs = data.normalize_code(target_langs) 272 273 minlen = min_levenshtein_length(len(unit_source), self.min_similarity) 274 maxlen = max_levenshtein_length(len(unit_source), self.min_similarity, self.max_length) 275 276 # split source into words, remove punctuation and special 277 # chars, keep words that are at least 3 chars long 278 unit_words = STRIP_REGEXP.sub(' ', unit_source).split() 279 unit_words = filter(lambda word: len(word) > 2, unit_words) 280 281 if self.fulltext and len(unit_words) > 3: 282 logging.debug("fulltext matching") 283 query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid JOIN fulltext f ON s.sid = f.docid 284 WHERE s.lang IN (?) AND t.lang IN (?) AND s.length BETWEEN ? AND ? 285 AND fulltext MATCH ?""" 286 search_str = " OR ".join(unit_words) 287 self.cursor.execute(query, (source_langs, target_langs, minlen, maxlen, search_str)) 288 else: 289 logging.debug("nonfulltext matching") 290 query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid 291 WHERE s.lang IN (?) AND t.lang IN (?) 292 AND s.length >= ? 
AND s.length <= ?""" 293 self.cursor.execute(query, (source_langs, target_langs, minlen, maxlen)) 294 295 results = [] 296 for row in self.cursor: 297 result = {} 298 result['source'] = row[0] 299 result['target'] = row[1] 300 result['context'] = row[2] 301 result['quality'] = self.comparer.similarity(unit_source, result['source'], self.min_similarity) 302 if result['quality'] >= self.min_similarity: 303 results.append(result) 304 results.sort(key=lambda match: match['quality'], reverse=True) 305 results = results[:self.max_candidates] 306 logging.debug("results: %s", unicode(results)) 307 return results
308 309
def min_levenshtein_length(length, min_similarity):
    """Shortest candidate length that could still reach min_similarity
    against a string of the given length (never below 2)."""
    lower_bound = length * (min_similarity / 100.0)
    if lower_bound < 2:
        lower_bound = 2
    return math.ceil(lower_bound)
312
def max_levenshtein_length(length, min_similarity, max_length):
    """Longest candidate length that could still reach min_similarity
    against a string of the given length, capped at max_length."""
    upper_bound = length / (min_similarity / 100.0)
    if upper_bound > max_length:
        upper_bound = max_length
    return math.floor(upper_bound)
315