1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """
24 interface to the xapian indexing engine for the translate toolkit
25
26 Xapian v1.0 or higher is supported.
27
28 If you are interested in writing an interface for Xapian 0.x, then
29 you should check out the following::
30 svn export -r 7235 https://translate.svn.sourceforge.net/svnroot/translate/src/branches/translate-search-indexer-generic-merging/translate/search/indexer/
31 It is not completely working, but it should give you a good start.
32 """
33
34 __revision__ = "$Id: XapianIndexer.py 10834 2009-04-09 14:40:06Z alaaosh $"
35
36
37
38 import sys
# Refuse to be imported when 'apache' or '_apache' is already present in
# sys.modules: this is taken as the sign that we are running under
# mod_python, where xapian can not be loaded (see the error message below).
if 'apache' in sys.modules or '_apache' in sys.modules:
    raise ImportError("Running under mod_python, can't load xapian")
41
42 import CommonIndexer
43 import xapian
44 import os
45 import re
46
47
49 return xapian.major_version() > 0
50
51
52
53
54
# Length limit for terms: longer terms are shortened before they are added
# to a document without tokenizing.
_MAX_TERM_LENGTH = 128
56
57
59 """interface to the xapian (http://xapian.org) indexer
60 """
61
62 QUERY_TYPE = xapian.Query
63 INDEX_DIRECTORY_NAME = "xapian"
64
def __init__(self, basedir, analyzer=None, create_allowed=True):
    """initialize or open a xapian database

    An existing location is opened as a writable xapian database. A
    missing location is created (including its parent directory), unless
    'create_allowed' is False.

    @raise ValueError: the given location exists, but the database type
        is incompatible (e.g. created by a different indexing engine)
    @raise OSError: the database failed to initialize (creation is not
        allowed, the parent directory could not be created or xapian
        failed to open/create the database)

    @param basedir: the parent directory of the database
    @type basedir: str
    @param analyzer: bitwise combination of possible analyzer flags
        to be used as the default analyzer for this database. Leave it empty
        to use the system default analyzer (self.ANALYZER_DEFAULT).
        see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
    @type analyzer: int
    @param create_allowed: create the database, if necessary; default: True
    @type create_allowed: bool
    """
    # 'self.location' is read below but never assigned in this method - it
    # is expected to be set by the parent constructor (TODO confirm)
    super(XapianDatabase, self).__init__(basedir, analyzer=analyzer,
            create_allowed=create_allowed)
    if os.path.exists(self.location):
        # something exists at the location: it must be a xapian database
        try:
            self.database = xapian.WritableDatabase(self.location,
                    xapian.DB_OPEN)
        except xapian.DatabaseOpeningError as err_msg:
            raise ValueError("Indexer: failed to open xapian database "
                    "(%s) - maybe it is not a xapian database: %s"
                    % (self.location, err_msg))
    else:
        # nothing exists at the location: create a new database
        if not create_allowed:
            raise OSError("Indexer: skipping database creation")
        parent_path = os.path.dirname(self.location)
        try:
            if not os.path.isdir(parent_path):
                os.makedirs(parent_path)
        except (IOError, OSError) as err_msg:
            # os.makedirs reports failures as OSError - catching only
            # IOError would let the raw exception escape unwrapped
            raise OSError("Indexer: failed to create the parent "
                    "directory (%s) of the indexing database: %s"
                    % (parent_path, err_msg))
        try:
            self.database = xapian.WritableDatabase(self.location,
                    xapian.DB_CREATE_OR_OPEN)
        except xapian.DatabaseOpeningError as err_msg:
            raise OSError("Indexer: failed to open or create a xapian "
                    "database (%s): %s" % (self.location, err_msg))
114
def flush(self, optimize=False):
    """write the pending changes to disk and reopen the database

    A writable database is flushed first. Afterwards the current database
    handle is dropped and a new one is obtained via '_prepare_database'.

    @param optimize: ignored for xapian
    @type optimize: bool
    """
    current_db = self.database
    if isinstance(current_db, xapian.WritableDatabase):
        current_db.flush()
    # release the old handle before the database is opened again
    self.database = None
    self._prepare_database()
128
130 """generate a query based on an existing query object
131
132 basically this function should just create a copy of the original
133
134 @param query: the original query object
135 @type query: xapian.Query
136 @return: the resulting query object
137 @rtype: xapian.Query
138 """
139
140 return xapian.Query(query)
141
144 """generate a query for a plain term of a string query
145
146 basically this function parses the string and returns the resulting
147 query
148
149 @param text: the query string
150 @type text: str
151 @param require_all: boolean operator
152 (True -> AND (default) / False -> OR)
153 @type require_all: bool
154 @param analyzer: Define query options (partial matching, exact matching,
155 tokenizing, ...) as bitwise combinations of
156 CommonIndexer.ANALYZER_???.
157 This can override previously defined field analyzer settings.
158 If analyzer is None (default), then the configured analyzer for the
159 field is used.
160 @type analyzer: int
161 @return: resulting query object
162 @rtype: xapian.Query
163 """
164 qp = xapian.QueryParser()
165 qp.set_database(self.database)
166 if require_all:
167 qp.set_default_op(xapian.Query.OP_AND)
168 else:
169 qp.set_default_op(xapian.Query.OP_OR)
170 if analyzer is None:
171 analyzer = self.analyzer
172 if analyzer & self.ANALYZER_PARTIAL > 0:
173 match_flags = xapian.QueryParser.FLAG_PARTIAL
174 return qp.parse_query(text, match_flags)
175 elif analyzer == self.ANALYZER_EXACT:
176
177 return xapian.Query(text)
178 else:
179
180 match_flags = 0
181 return qp.parse_query(text, match_flags)
182
184 """generate a field query
185
186 this function creates a field->value query
187
188 @param field: the fieldname to be used
189 @type field: str
190 @param value: the wanted value of the field
191 @type value: str
192 @param analyzer: Define query options (partial matching, exact matching,
193 tokenizing, ...) as bitwise combinations of
194 CommonIndexer.ANALYZER_???.
195 This can override previously defined field analyzer settings.
196 If analyzer is None (default), then the configured analyzer for the
197 field is used.
198 @type analyzer: int
199 @return: the resulting query object
200 @rtype: xapian.Query
201 """
202 if analyzer is None:
203 analyzer = self.analyzer
204 if analyzer == self.ANALYZER_EXACT:
205
206 return xapian.Query("%s%s" % (field.upper(), value))
207
208 qp = xapian.QueryParser()
209 qp.set_database(self.database)
210 if (analyzer & self.ANALYZER_PARTIAL > 0):
211
212 match_flags = xapian.QueryParser.FLAG_PARTIAL
213 return qp.parse_query(value, match_flags, field.upper())
214 else:
215
216 match_flags = 0
217 return qp.parse_query(value, match_flags, field.upper())
218
220 """generate a combined query
221
222 @param queries: list of the original queries
223 @type queries: list of xapian.Query
224 @param require_all: boolean operator
225 (True -> AND (default) / False -> OR)
226 @type require_all: bool
227 @return: the resulting combined query object
228 @rtype: xapian.Query
229 """
230 if require_all:
231 query_op = xapian.Query.OP_AND
232 else:
233 query_op = xapian.Query.OP_OR
234 return xapian.Query(query_op, queries)
235
237 """create an empty document to be filled and added to the index later
238
239 @return: the new document object
240 @rtype: xapian.Document
241 """
242 return xapian.Document()
243
245 """add a term to a document
246
247 @param document: the document to be changed
248 @type document: xapian.Document
249 @param term: a single term to be added
250 @type term: str
251 @param tokenize: should the term be tokenized automatically
252 @type tokenize: bool
253 """
254 if tokenize:
255 term_gen = xapian.TermGenerator()
256 term_gen.set_document(document)
257 term_gen.index_text(term)
258 else:
259 document.add_term(_truncate_term_length(term))
260
262 """add a field term to a document
263
264 @param document: the document to be changed
265 @type document: xapian.Document
266 @param field: name of the field
267 @type field: str
268 @param term: term to be associated to the field
269 @type term: str
270 @param tokenize: should the term be tokenized automatically
271 @type tokenize: bool
272 """
273 if tokenize:
274 term_gen = xapian.TermGenerator()
275 term_gen.set_document(document)
276 term_gen.index_text(term, 1, field.upper())
277 else:
278 document.add_term(_truncate_term_length("%s%s" % \
279 (field.upper(), term)))
280
282 """add a prepared document to the index database
283
284 @param document: the document to be added
285 @type document: xapian.Document
286 """
287
288 self._prepare_database(writable=True)
289 self.database.add_document(document)
290
292 """begin a transaction
293
294 Xapian supports transactions to group multiple database modifications.
295 This avoids intermediate flushing and therefore increases performance.
296 """
297 self._prepare_database(writable=True)
298 self.database.begin_transaction()
299
301 """cancel an ongoing transaction
302
303 no changes since the last execution of 'begin_transaction' are written
304 """
305 self._prepare_database(writable=True)
306 self.database.cancel_transaction()
307
309 """submit the changes of an ongoing transaction
310
311 all changes since the last execution of 'begin_transaction' are written
312 """
313 self._prepare_database(writable=True)
314 self.database.commit_transaction()
315
317 """return an object containing the results of a query
318
319 @param query: a pre-compiled xapian query
320 @type query: xapian.Query
321 @return: an object that allows access to the results
322 @rtype: XapianIndexer.CommonEnquire
323 """
324 enquire = xapian.Enquire(self.database)
325 enquire.set_query(query)
326 return XapianEnquire(enquire)
327
329 """delete a specified document
330
331 @param docid: the document ID to be deleted
332 @type docid: int
333 """
334
335 self._prepare_database(writable=True)
336 try:
337 self.database.delete_document(docid)
338 return True
339 except xapian.DocNotFoundError:
340 return False
341
def search(self, query, fieldnames):
    """collect the contents of the given fields for all matches of a query

    @param query: the query to be issued
    @type query: xapian.Query
    @param fieldnames: the name(s) of a field of the document content;
        a single name is treated like a list containing just this name
    @type fieldnames: string | list of strings
    @return: a list of dicts containing the specified field(s)
    @rtype: list of dicts
    """
    if isinstance(fieldnames, basestring):
        fieldnames = [fieldnames]
    collected = []
    # '_extract_fieldvalues' appends one dict per match to 'collected'
    self._walk_matches(query, _extract_fieldvalues, (collected, fieldnames))
    return collected
358
def _prepare_database(self, writable=False):
    """reopen the database as read-only or as writable if necessary

    this fixes a xapian specific issue regarding open locks for
    writable databases

    The current handle is kept if it already is an instance of the
    requested kind (xapian.WritableDatabase for writable access,
    xapian.Database otherwise).

    @param writable: True for opening a writable database
    @type writable: bool
    """
    # NOTE(review): the 'def' line is missing from the extracted listing; the
    # default 'writable=False' is inferred from the argument-less call in
    # 'flush' - confirm against the original file
    if writable:
        if not isinstance(self.database, xapian.WritableDatabase):
            self.database = xapian.WritableDatabase(self.location,
                    xapian.DB_OPEN)
    elif not isinstance(self.database, xapian.Database):
        self.database = xapian.Database(self.location)
374
375
377 """interface to the xapian object for storing sets of matches
378 """
379
381 """return a specified number of qualified matches of a previous query
382
383 @param start: index of the first match to return (starting from zero)
384 @type start: int
385 @param number: the number of matching entries to return
386 @type number: int
387 @return: a set of matching entries and some statistics
388 @rtype: tuple of (returned number, available number, matches)
389 "matches" is a dictionary of::
390 ["rank", "percent", "document", "docid"]
391 """
392 matches = self.enquire.get_mset(start, number)
393 result = []
394 for match in matches:
395 elem = {}
396 elem["rank"] = match[xapian.MSET_RANK]
397 elem["docid"] = match[xapian.MSET_DID]
398 elem["percent"] = match[xapian.MSET_PERCENT]
399 elem["document"] = match[xapian.MSET_DOCUMENT]
400 result.append(elem)
401 return (matches.size(), matches.get_matches_estimated(), result)
402
403
def _truncate_term_length(term, taken=0):
    """shorten a term string to the maximum length allowed for xapian terms

    @param term: the value of the term, that should be truncated
    @type term: str
    @param taken: since a term consists of the name of the term and its
        actual value, this additional parameter can be used to reduce the
        maximum count of possible characters
    @type taken: int
    @return: the truncated string (or the unchanged term if it is short
        enough)
    @rtype: str
    """
    limit = _MAX_TERM_LENGTH - taken
    if len(term) <= limit:
        return term
    # NOTE(review): a truncated term ends up one character shorter than
    # 'limit', while a term of exactly 'limit' characters is kept as it is
    return term[:limit - 1]
421
def _extract_fieldvalues(match, result_and_fieldnames):
    """add a dict of field values to a list

    usually this function should be used together with '_walk_matches'
    for traversing a list of matches

    A term belongs to a field if it starts with the upper case field name
    followed by a character that is not an upper case ASCII letter; the
    remainder of the term is the value. The field name None selects the
    terms that do not start with an upper case ASCII letter; these are
    returned unchanged.

    @param match: a single match object (only match["document"] is used)
    @type match: xapian.MSet
    @param result_and_fieldnames: the pair (result, fieldnames):
        the resulting dict (field name -> list of values) is appended to
        the list 'result'; 'fieldnames' are the names of the fields to be
        added to the dict
    @type result_and_fieldnames: tuple of (list of dict, list of str)
    """
    # NOTE(review): assumes '_walk_matches' calls this function as
    # function(match, (result, fieldnames)) - confirm against CommonIndexer
    result, fieldnames = result_and_fieldnames
    # build the pattern of every field only once instead of once per term
    matchers = []
    for fname in fieldnames:
        if fname is None:
            # plain terms: never treat None as a field called "NONE"
            matchers.append((fname, re.compile("[^A-Z]"), 0))
        else:
            prefix = str(fname).upper()
            # the field name is literal text, not a regular expression
            pattern = re.compile("%s[^A-Z]" % re.escape(prefix))
            matchers.append((fname, pattern, len(prefix)))
    item_fields = {}
    for term in match["document"].termlist():
        for fname, pattern, offset in matchers:
            if pattern.match(term.term):
                item_fields.setdefault(fname, []).append(term.term[offset:])
    result.append(item_fields)
451