1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """Module to provide a translation memory database."""
23 import math
24 import time
25 import logging
26 import re
27 try:
28 from sqlite3 import dbapi2
29 except ImportError:
30 from pysqlite2 import dbapi2
31
32 from translate.search.lshtein import LevenshteinComparer
33 from translate.lang import data
34
35
# Matches any non-word character; used to split source strings into words
# for fulltext matching.  Raw string avoids the invalid "\W" escape warning.
STRIP_REGEXP = re.compile(r"\W", re.UNICODE)
37
41
43 return str(self.value)
44
45
47 _tm_dbs = {}
def __init__(self, db_file, max_candidates=3, min_similarity=75, max_length=1000):
    """Open (or create) the translation memory database stored in db_file.

    :param db_file: path to the sqlite database file
    :param max_candidates: maximum number of suggestions returned per lookup
    :param min_similarity: minimum Levenshtein similarity (percent) a
        candidate needs to be returned
    :param max_length: longest source text considered for matching
    """
    self.max_candidates = max_candidates
    self.min_similarity = min_similarity
    self.max_length = max_length

    # one shared connection per database file across all instances
    # (has_key is deprecated and gone in Python 3; "in" works everywhere)
    if db_file not in self._tm_dbs:
        self._tm_dbs[db_file] = dbapi2.connect(db_file)

    self.connection = self._tm_dbs[db_file]
    self.cursor = self.connection.cursor()

    # create tables/indices, then fulltext support if fts3 is available
    self.init_database()
    self.fulltext = False
    self.init_fulltext()

    self.comparer = LevenshteinComparer(self.max_length)

    self.preload_db()
69
71 """creates database tables and indices"""
72
73 script = """
74 CREATE TABLE IF NOT EXISTS sources (
75 sid INTEGER PRIMARY KEY AUTOINCREMENT,
76 text VARCHAR NOT NULL,
77 context VARCHAR DEFAULT NULL,
78 lang VARCHAR NOT NULL,
79 length INTEGER NOT NULL
80 );
81 CREATE INDEX IF NOT EXISTS sources_context_idx ON sources (context);
82 CREATE INDEX IF NOT EXISTS sources_lang_idx ON sources (lang);
83 CREATE INDEX IF NOT EXISTS sources_length_idx ON sources (length);
84 CREATE UNIQUE INDEX IF NOT EXISTS sources_uniq_idx ON sources (text, context, lang);
85
86 CREATE TABLE IF NOT EXISTS targets (
87 tid INTEGER PRIMARY KEY AUTOINCREMENT,
88 sid INTEGER NOT NULL,
89 text VARCHAR NOT NULL,
90 lang VARCHAR NOT NULL,
91 time INTEGER DEFAULT NULL,
92 FOREIGN KEY (sid) references sources(sid)
93 );
94 CREATE INDEX IF NOT EXISTS targets_sid_idx ON targets (sid);
95 CREATE INDEX IF NOT EXISTS targets_lang_idx ON targets (lang);
96 CREATE INDEX IF NOT EXISTS targets_time_idx ON targets (time);
97 CREATE UNIQUE INDEX IF NOT EXISTS targets_uniq_idx ON targets (sid, text, lang);
98 """
99
100 try:
101 self.cursor.executescript(script)
102 self.connection.commit()
103 except:
104 self.connection.rollback()
105 raise
106
def init_fulltext(self):
    """detects if fts3 fulltext indexing module exists, initializes
    fulltext table if it does

    Sets self.fulltext to True on success.  On failure sets it to False
    and drops any stale triggers so plain inserts keep working.
    """
    try:
        # probe fts3 support with a throwaway virtual table
        script = """
DROP TABLE IF EXISTS test_for_fts3;
CREATE VIRTUAL TABLE test_for_fts3 USING fts3;
DROP TABLE test_for_fts3;
"""
        self.cursor.executescript(script)
        logging.debug("fts3 supported")

        # create the fulltext index table only if it is not there yet
        self.cursor.execute("SELECT name FROM sqlite_master WHERE name = 'fulltext'")
        if not self.cursor.fetchone():
            script = """
CREATE VIRTUAL TABLE fulltext USING fts3(text);
"""
            logging.debug("fulltext table not exists, creating")
            self.cursor.executescript(script)
            logging.debug("created fulltext table")
        else:
            logging.debug("fulltext table already exists")

        # backfill any unindexed sources rows, then keep the two tables
        # in sync with insert/update/delete triggers
        script = """
INSERT INTO fulltext (rowid, text) SELECT sid, text FROM sources WHERE sid NOT IN (SELECT rowid FROM fulltext);
CREATE TRIGGER IF NOT EXISTS sources_insert_trig AFTER INSERT ON sources FOR EACH ROW
BEGIN
    INSERT INTO fulltext (docid, text) VALUES (NEW.sid, NEW.text);
END;
CREATE TRIGGER IF NOT EXISTS sources_update_trig AFTER UPDATE OF text ON sources FOR EACH ROW
BEGIN
    UPDATE fulltext SET text = NEW.text WHERE docid = NEW.sid;
END;
CREATE TRIGGER IF NOT EXISTS sources_delete_trig AFTER DELETE ON sources FOR EACH ROW
BEGIN
    DELETE FROM fulltext WHERE docid = OLD.sid;
END;
"""
        self.cursor.executescript(script)
        self.connection.commit()
        logging.debug("created fulltext triggers")
        self.fulltext = True

    # "except E, e" is Python-2-only syntax; "as" works on 2.6+ and 3.x
    except dbapi2.OperationalError as e:
        self.fulltext = False
        logging.debug("failed to initialize fts3 support: " + str(e))
        script = """
DROP TRIGGER IF EXISTS sources_insert_trig;
DROP TRIGGER IF EXISTS sources_update_trig;
DROP TRIGGER IF EXISTS sources_delete_trig;
"""
        self.cursor.executescript(script)
163
165 """ugly hack to force caching of sqlite db file in memory for
166 improved performance"""
167 if self.fulltext:
168 query = """SELECT COUNT(*) FROM sources s JOIN fulltext f ON s.sid = f.docid JOIN targets t on s.sid = t.sid"""
169 else:
170 query = """SELECT COUNT(*) FROM sources s JOIN targets t on s.sid = t.sid"""
171 self.cursor.execute(query)
172 (numrows,) = self.cursor.fetchone()
173 logging.debug("tmdb has %d records" % numrows)
174 return numrows
175
176 - def add_unit(self, unit, source_lang=None, target_lang=None, commit=True):
196
def add_dict(self, unit, source_lang, target_lang, commit=True):
    """inserts units represented as dictionaries in database

    The unit dict must carry "source", "context" and "target" keys.
    Duplicate sources are reused; duplicate targets are ignored.
    """
    source_lang = data.normalize_code(source_lang)
    target_lang = data.normalize_code(target_lang)
    try:
        source_text = unit["source"]
        context = unit["context"]
        try:
            self.cursor.execute(
                "INSERT INTO sources (text, context, lang, length) VALUES(?, ?, ?, ?)",
                (source_text, context, source_lang, len(source_text)))
            sid = self.cursor.lastrowid
        except dbapi2.IntegrityError:
            # this source already exists; look up its id instead
            self.cursor.execute(
                "SELECT sid FROM sources WHERE text=? AND context=? and lang=?",
                (source_text, context, source_lang))
            (sid,) = self.cursor.fetchone()
        try:
            self.cursor.execute(
                "INSERT INTO targets (sid, text, lang, time) VALUES (?, ?, ?, ?)",
                (sid, unit["target"], target_lang, int(time.time())))
        except dbapi2.IntegrityError:
            # identical target already stored for this source; nothing to do
            pass

        if commit:
            self.connection.commit()
    except:
        if commit:
            self.connection.rollback()
        raise
235
236 - def add_store(self, store, source_lang, target_lang, commit=True):
246
def add_list(self, units, source_lang, target_lang, commit=True):
    """insert all units in list into the database, units are
    represented as dictionaries

    Returns how many units were handed to add_dict.  The per-unit
    inserts defer committing; a single commit happens at the end.
    """
    total = 0
    for total, unit in enumerate(units, 1):
        self.add_dict(unit, source_lang, target_lang, commit=False)
    if commit:
        self.connection.commit()
    return total
257
259 """return TM suggestions for unit_source"""
260 if isinstance(unit_source, str):
261 unit_source = unicode(unit_source, "utf-8")
262 if isinstance(source_langs, list):
263 source_langs = [data.normalize_code(lang) for lang in source_langs]
264 source_langs = ','.join(source_langs)
265 else:
266 source_langs = data.normalize_code(source_langs)
267 if isinstance(target_langs, list):
268 target_langs = [data.normalize_code(lang) for lang in target_langs]
269 target_langs = ','.join(target_langs)
270 else:
271 target_langs = data.normalize_code(target_langs)
272
273 minlen = min_levenshtein_length(len(unit_source), self.min_similarity)
274 maxlen = max_levenshtein_length(len(unit_source), self.min_similarity, self.max_length)
275
276
277
278 unit_words = STRIP_REGEXP.sub(' ', unit_source).split()
279 unit_words = filter(lambda word: len(word) > 2, unit_words)
280
281 if self.fulltext and len(unit_words) > 3:
282 logging.debug("fulltext matching")
283 query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid JOIN fulltext f ON s.sid = f.docid
284 WHERE s.lang IN (?) AND t.lang IN (?) AND s.length BETWEEN ? AND ?
285 AND fulltext MATCH ?"""
286 search_str = " OR ".join(unit_words)
287 self.cursor.execute(query, (source_langs, target_langs, minlen, maxlen, search_str))
288 else:
289 logging.debug("nonfulltext matching")
290 query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid
291 WHERE s.lang IN (?) AND t.lang IN (?)
292 AND s.length >= ? AND s.length <= ?"""
293 self.cursor.execute(query, (source_langs, target_langs, minlen, maxlen))
294
295 results = []
296 for row in self.cursor:
297 result = {}
298 result['source'] = row[0]
299 result['target'] = row[1]
300 result['context'] = row[2]
301 result['quality'] = self.comparer.similarity(unit_source, result['source'], self.min_similarity)
302 if result['quality'] >= self.min_similarity:
303 results.append(result)
304 results.sort(key=lambda match: match['quality'], reverse=True)
305 results = results[:self.max_candidates]
306 logging.debug("results: %s", unicode(results))
307 return results
308
309
def min_levenshtein_length(length, min_similarity):
    """Shortest candidate length that can still reach min_similarity
    percent similarity to a string of the given length (at least 2)."""
    return math.ceil(max(length * (min_similarity/100.0), 2))
312
def max_levenshtein_length(length, min_similarity, max_length):
    """Longest candidate length that can still reach min_similarity
    percent similarity to a string of the given length, capped at
    max_length."""
    return math.floor(min(length / (min_similarity/100.0), max_length))
315