1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """
24 base class for interfaces to indexing engines for pootle
25 """
26
27 import translate.lang.data
28 import os
29
30 __revision__ = "$Id: CommonIndexer.py 8507 2008-09-27 09:15:08Z dwaynebailey $"
31
32
def is_available():
    """check if this indexing engine interface is usable

    this function must exist in every module that contains indexing engine
    interfaces

    @return: is this interface usable?
    @rtype: bool
    """
    # the base module only defines the interface - it provides no real
    # indexing engine of its own
    return False
43
44
class CommonDatabase(object):
    """base class for indexing support

    any real implementation must override most methods of this class
    """

    field_analyzers = {}
    """mapping of field names and analyzers - see 'set_field_analyzers'"""

    ANALYZER_EXACT = 0
    """exact matching: the query string must equal the whole term string"""

    ANALYZER_PARTIAL = 1 << 1
    """partial matching: a document matches, even if the query string only
    matches the beginning of the term value."""

    ANALYZER_TOKENIZE = 1 << 2
    """tokenize terms and queries automatically"""

    ANALYZER_DEFAULT = ANALYZER_TOKENIZE | ANALYZER_PARTIAL
    """the default analyzer to be used if nothing is configured"""

    QUERY_TYPE = None
    """override this with the query class of the implementation"""

    INDEX_DIRECTORY_NAME = None
    """override this with a string to be used as the name of the indexing
    directory/file in the filesystem
    """

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open an indexing database

        Any derived class must override __init__.

        Any implementation can rely on the "self.location" attribute to be set
        by the __init__ function of the super class.

        @raise ValueError: the given location exists, but the database type
            is incompatible (e.g. created by a different indexing engine)
        @raise OSError: the database failed to initialize

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it
            empty to use the system default analyzer (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary; default: True
        @type create_allowed: bool
        """
        # derived classes must define these two class attributes
        if self.QUERY_TYPE is None:
            raise NotImplementedError("Incomplete indexer implementation: "
                    + "'QUERY_TYPE' is undefined")
        if self.INDEX_DIRECTORY_NAME is None:
            raise NotImplementedError("Incomplete indexer implementation: "
                    + "'INDEX_DIRECTORY_NAME' is undefined")
        self.location = os.path.join(basedir, self.INDEX_DIRECTORY_NAME)
        if (not create_allowed) and (not os.path.exists(self.location)):
            raise OSError("Indexer: the database does not exist - and I am"
                    + " not configured to create it.")
        if analyzer is None:
            self.analyzer = self.ANALYZER_DEFAULT
        else:
            self.analyzer = analyzer
        # give every instance its own mapping so instances do not share the
        # class-level dict
        self.field_analyzers = {}

    def flush(self, optimize=False):
        """flush the content of the database - to force changes to be written
        to disk

        some databases also support index optimization

        @param optimize: should the index be optimized if possible?
        @type optimize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'flush' is missing")

    def make_query(self, args, require_all=True, analyzer=None):
        """create simple queries (strings or field searches) or
        combine multiple queries (AND/OR)

        To specify rules for field searches, you may want to take a look at
        'set_field_analyzers'. The parameter 'match_text_partial' can override
        the previously defined default setting.

        @param args: queries or search string or description of field query
            examples::
                [xapian.Query("foo"), xapian.Query("bar")]
                xapian.Query("foo")
                "bar"
                {"foo": "bar", "foobar": "foo"}
        @type args: list of queries | single query | str | dict
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: boolean
        @param analyzer: (only applicable for 'dict' or 'str')
            Define query options (partial matching, exact matching, tokenizing,
            ...) as bitwise combinations of CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: the combined query
        @rtype: query type of the specific implementation
        """
        # turn a dict into a list of (key, value) tuples
        if isinstance(args, dict):
            args = args.items()
        # turn everything else into a list of queries
        if not isinstance(args, list):
            args = [args]
        result = []
        for query in args:
            # a query of the native type is just copied
            if isinstance(query, self.QUERY_TYPE):
                result.append(self._create_query_for_query(query))
            # a (field, value) tuple becomes a field query
            elif isinstance(query, tuple):
                field, value = query
                # perform unicode normalization
                field = translate.lang.data.normalize(unicode(field))
                value = translate.lang.data.normalize(unicode(value))
                # bug fix: use a local name instead of rebinding 'analyzer' -
                # the original overwrote the parameter, silently applying the
                # first field's analyzer to every following query in the list
                if analyzer is None:
                    field_analyzer = self.get_field_analyzers(field)
                else:
                    field_analyzer = analyzer
                result.append(self._create_query_for_field(field, value,
                        analyzer=field_analyzer))
            # a plain string is parsed as a query string
            elif isinstance(query, basestring):
                if analyzer is None:
                    string_analyzer = self.analyzer
                else:
                    string_analyzer = analyzer
                # perform unicode normalization
                query = translate.lang.data.normalize(unicode(query))
                result.append(self._create_query_for_string(query,
                        require_all=require_all, analyzer=string_analyzer))
            else:
                # other types of queries are not supported
                raise ValueError("Unable to handle query type: %s"
                        % str(type(query)))
        # combine all sub-queries with AND or OR
        return self._create_query_combined(result, require_all)

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: xapian.Query
        @return: the resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_create_query_for_query' is missing")

    def _create_query_for_string(self, text, require_all=True, analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_create_query_for_string' is missing")

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this functions creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_create_query_for_field' is missing")

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_create_query_combined' is missing")

    def index_document(self, data):
        """add the given data to the database

        @param data: the data to be indexed.
            A dictionary will be treated as fieldname:value combinations.
            If the fieldname is None then the value will be interpreted as a
            plain term or as a list of plain terms.
            Lists of terms are indexed separately.
            Lists of strings are treated as plain terms.
        @type data: dict | list of str
        """
        doc = self._create_empty_document()
        if isinstance(data, dict):
            data = data.items()
        # add all data elements to the document
        for dataset in data:
            if isinstance(dataset, tuple):
                # the dataset is a (key, value) pair
                key, value = dataset
                if key is None:
                    # no field name: index the value(s) as plain term(s)
                    if isinstance(value, list):
                        terms = value[:]
                    elif isinstance(value, basestring):
                        terms = [value]
                    else:
                        # bug fix: report the offending value - the original
                        # reported type(data), which names the whole container
                        raise ValueError("Invalid data type to be indexed: %s"
                                % str(type(value)))
                    for one_term in terms:
                        self._add_plain_term(doc, self._decode(one_term),
                                (self.ANALYZER_DEFAULT & self.ANALYZER_TOKENIZE > 0))
                else:
                    analyze_settings = self.get_field_analyzers(key)
                    # wrap single values in a list
                    if not isinstance(value, list):
                        value = [value]
                    for one_term in value:
                        self._add_field_term(doc, key, self._decode(one_term),
                                (analyze_settings & self.ANALYZER_TOKENIZE > 0))
            elif isinstance(dataset, basestring):
                # a plain term without a field name
                self._add_plain_term(doc, self._decode(dataset),
                        (self.ANALYZER_DEFAULT & self.ANALYZER_TOKENIZE > 0))
            else:
                # bug fix: report the offending dataset (not the container)
                raise ValueError("Invalid data type to be indexed: %s"
                        % str(type(dataset)))
        self._add_document_to_index(doc)

    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: xapian.Document | PyLucene.Document
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_create_empty_document' is missing")

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_add_plain_term' is missing")

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_add_field_term' is missing")

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: xapian.Document | PyLucene.Document
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_add_document_to_index' is missing")

    def begin_transaction(self):
        """begin a transaction

        You can group multiple modifications of a database as a transaction.
        This prevents time-consuming database flushing and helps, if you want
        that a changeset is committed either completely or not at all.
        No changes will be written to disk until 'commit_transaction'.
        'cancel_transaction' can be used to revert an ongoing transaction.

        Database types that do not support transactions may silently ignore it.
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'begin_transaction' is missing")

    def cancel_transaction(self):
        """cancel an ongoing transaction

        See 'start_transaction' for details.
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'cancel_transaction' is missing")

    def commit_transaction(self):
        """submit the currently ongoing transaction and write changes to disk

        See 'start_transaction' for details.
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'commit_transaction' is missing")

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled query
        @type query: a query object of the real implementation
        @return: an object that allows access to the results
        @rtype: subclass of CommonEnquire
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'get_query_result' is missing")

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'delete_document_by_id' is missing")

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches
        of a query

        @param query: the query to be issued
        @type query: a query object of the real implementation
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'search' is missing")

    def delete_doc(self, ident):
        """delete the documents returned by a query

        @param ident: [list of] document IDs | dict describing a query | query
        @type ident: int | list of tuples | dict | list of dicts |
            query (e.g. xapian.Query) | list of queries
        @return: the number of deleted documents
        @rtype: int
        """
        # turn a single item into a list
        if isinstance(ident, list):
            ident_list = ident
        else:
            ident_list = [ident]
        if len(ident_list) == 0:
            # nothing to be deleted
            return 0
        if isinstance(ident_list[0], int):
            # delete all documents with the given numeric IDs
            success_delete = [match for match in ident_list
                    if self.delete_document_by_id(match)]
            return len(success_delete)
        if isinstance(ident_list[0], dict):
            # a list of dicts: each dict describes a field query;
            # combine them all with AND
            query = self.make_query([self.make_query(query_dict,
                    require_all=True) for query_dict in ident_list],
                    require_all=True)
        elif isinstance(ident_list[0], object):
            # NOTE(review): everything is an 'object', so this branch acts as
            # the catch-all for native query objects and the TypeError below
            # is effectively unreachable - kept for backwards compatibility
            query = self.make_query(ident_list, require_all=True)
        else:
            raise TypeError("description of documents to-be-deleted is not "
                    + "supported: list of %s" % type(ident_list[0]))
        # collect the document IDs of all matches, then delete them by ID
        remove_list = []

        def add_docid_to_list(match):
            """collect every document ID"""
            remove_list.append(match["docid"])
        self._walk_matches(query, add_docid_to_list)
        return self.delete_doc(remove_list)

    def _walk_matches(self, query, function, arg_for_function=None):
        """use this function if you want to do something with every single
        match of a query

        example::
            self._walk_matches(query, function_for_match, arg_for_func)
        'function_for_match' expects only one argument: the matched object

        @param query: a query object of the real implementation
        @type query: xapian.Query | PyLucene.Query
        @param function: the function to execute with every match
        @type function: function
        @param arg_for_function: an optional argument for the function
        @type arg_for_function: anything
        """
        # execute the query
        enquire = self.get_query_result(query)
        # start with the first match
        start = 0
        # "size" is the number of matches returned in the previous chunk;
        # "avail" is the estimated total number of matches
        size, avail = (0, 1)
        # fetch the matches in small chunks
        steps = 2
        while start < avail:
            (size, avail, matches) = enquire.get_matches(start, steps)
            for match in matches:
                if arg_for_function is None:
                    function(match)
                else:
                    function(match, arg_for_function)
            start += size

    def set_field_analyzers(self, field_analyzers):
        """set the analyzers for different fields of the database documents

        All bitwise combinations of CommonIndexer.ANALYZER_??? are possible.

        @param field_analyzers: mapping of field names and analyzers
        @type field_analyzers: dict containing field names and analyzers
        @raise TypeError: invalid values in 'field_analyzers'
        """
        for field, analyzer in field_analyzers.items():
            # check the validity of the input values
            if not isinstance(field, (str, unicode)):
                raise TypeError("field name must be a string")
            if not isinstance(analyzer, int):
                raise TypeError("the analyzer must be a whole number (int)")
            # store the analyzer for the field
            self.field_analyzers[field] = analyzer

    def get_field_analyzers(self, fieldnames=None):
        """return the analyzer that was mapped to a specific field

        see 'set_field_analyzers' for details

        @param fieldnames: the analyzer of this field (or all/multiple fields)
            is requested; leave empty (or "None") to request all fields
        @type fieldnames: str | list of str | None
        @return: the analyzer setting of the field - see
            CommonDatabase.ANALYZER_??? or a dict of field names and analyzers
        @rtype: int | dict
        """
        # all field analyzers are requested
        if fieldnames is None:
            # return a copy to prevent changes to the internal mapping
            return dict(self.field_analyzers)
        # a single field is requested
        if isinstance(fieldnames, (str, unicode)):
            # idiom fix: 'in' instead of the deprecated dict.has_key
            if fieldnames in self.field_analyzers:
                return self.field_analyzers[fieldnames]
            else:
                return self.analyzer
        # a list of fields is requested
        if isinstance(fieldnames, list):
            result = {}
            for field in fieldnames:
                result[field] = self.get_field_analyzers(field)
            return result
        # invalid request: fall back to the default analyzer
        return self.analyzer

    def _decode(self, text):
        """decode the string from utf-8 or charmap
        perform unicode normalization
        """
        if isinstance(text, str):
            try:
                result = unicode(text.decode("UTF-8"))
            except UnicodeDecodeError:
                # bug fix: decoding raises UnicodeDecodeError - the original
                # caught UnicodeEncodeError, so this charmap fallback was
                # unreachable and invalid UTF-8 input crashed the indexer
                result = unicode(text.decode("charmap"))
        elif not isinstance(text, unicode):
            result = unicode(text)
        else:
            result = text
        return translate.lang.data.normalize(result)
562
563
class CommonEnquire(object):
    """an enquire object contains the information about the result of a request
    """

    def __init__(self, enquire):
        """initialization of a wrapper around enquires of different backends

        @param enquire: a previous enquire
        @type enquire: xapian.Enquire | pylucene-enquire
        """
        # keep a reference to the backend-specific enquire object
        self.enquire = enquire

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a dictionary of::
                ["rank", "percent", "document", "docid"]
        """
        raise NotImplementedError("Incomplete indexing implementation: "
                + "'get_matches' for the 'Enquire' class is missing")

    def get_matches_count(self):
        """return the estimated number of matches

        use "CommonIndexer.search" to retrieve the exact number of matches
        @return: the estimated number of matches
        @rtype: int
        """
        # only the estimate (second item of the tuple) is of interest here
        (returned, estimate_count, matches) = self.get_matches(0, 1)
        return estimate_count
600