Package translate :: Package search :: Package indexing :: Module PyLuceneIndexer1
[hide private]
[frames] | [no frames]

Source Code for Module translate.search.indexing.PyLuceneIndexer1

  1  # -*- coding: utf-8 -*- 
  2  # 
  3  # Copyright 2008 Zuza Software Foundation 
  4  #  
  5  # This file is part of translate. 
  6  # 
  7  # translate is free software; you can redistribute it and/or modify 
  8  # it under the terms of the GNU General Public License as published by 
  9  # the Free Software Foundation; either version 2 of the License, or 
 10  # (at your option) any later version. 
 11  #  
 12  # translate is distributed in the hope that it will be useful, 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 15  # GNU General Public License for more details. 
 16  # 
 17  # You should have received a copy of the GNU General Public License 
 18  # along with translate; if not, write to the Free Software 
 19  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 20  # 
 21   
 22   
 23  """ 
 24  interface for the pylucene (v1.x) indexing engine 
 25   
 26  take a look at PyLuceneIndexer.py for PyLucene v2.x support 
 27  """ 
 28   
 29  __revision__ = "$Id: PyLuceneIndexer1.py 8505 2008-09-27 09:08:34Z dwaynebailey $" 
 30   
 31  # this module is based on PyLuceneIndexer (for PyLucene v2.x) 
 32  import PyLuceneIndexer 
 33  import PyLucene 
 34   
 35   
def is_available():
    """Report whether this (PyLucene v1.x) backend can be used.

    @return: True only if the installed PyLucene bindings are major
        version 1 (v2.x is handled by the PyLuceneIndexer module instead)
    @rtype: bool
    """
    detected_major = PyLuceneIndexer._get_pylucene_version()
    return detected_major == 1
38 39
class PyLuceneDatabase(PyLuceneIndexer.PyLuceneDatabase):
    """manage and use a pylucene (v1.x) indexing database

    Inherits the bulk of its behaviour from the PyLucene v2.x backend
    (L{PyLuceneIndexer.PyLuceneDatabase}) and overrides only the calls
    whose API differs between PyLucene v1.x and v2.x.
    """

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: the analyzer to be used
            possible analyzers are:
                - L{CommonDatabase.ANALYZER_TOKENIZE}
                    the field value is splitted to be matched word-wise
                - L{CommonDatabase.ANALYZER_PARTIAL}
                    the field value must start with the query string
                - L{CommonDatabase.ANALYZER_EXACT}
                    keep special characters and the like
        @type analyzer: bool
        @return: resulting query object
        @rtype: PyLucene.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer == self.ANALYZER_EXACT:
            # exact matching: keep special characters, use the exact analyzer
            analyzer_obj = self.ExactAnalyzer()
        else:
            # escape Lucene metacharacters before handing the text to the
            # query parser
            text = _escape_term_value(text)
            analyzer_obj = PyLucene.StandardAnalyzer()
        qp = PyLucene.QueryParser(analyzer=analyzer_obj)
        if require_all:
            qp.setDefaultOperator(qp.Operator.AND)
        else:
            qp.setDefaultOperator(qp.Operator.OR)
        if (analyzer & self.ANALYZER_PARTIAL) > 0:
            # PyLucene uses explicit wildcards for partial matching
            text += "*"
        return qp.parse(text)

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this functions creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: the analyzer to be used
            possible analyzers are:
                - L{CommonDatabase.ANALYZER_TOKENIZE}
                    the field value is splitted to be matched word-wise
                - L{CommonDatabase.ANALYZER_PARTIAL}
                    the field value must start with the query string
                - L{CommonDatabase.ANALYZER_EXACT}
                    keep special characters and the like
        @type analyzer: bool
        @return: resulting query object
        @rtype: PyLucene.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer == self.ANALYZER_EXACT:
            analyzer_obj = self.ExactAnalyzer()
        else:
            value = _escape_term_value(value)
            analyzer_obj = PyLucene.StandardAnalyzer()
        if (analyzer & self.ANALYZER_PARTIAL) > 0:
            # PyLucene uses explicit wildcards for partial matching
            value += "*"
        return PyLucene.QueryParser.parse(value, field, analyzer_obj)

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: PyLucene.Query
        """
        combined_query = PyLucene.BooleanQuery()
        for query in queries:
            # BooleanClause(query, required, prohibited)
            combined_query.add(
                    PyLucene.BooleanClause(query, require_all, False))
        return combined_query

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        # Field parameters: name, string, store, index, token
        # fixed: the original referenced the undefined name "PyLuceneIndex"
        # (NameError) - the constant lives in the PyLuceneIndexer module
        document.add(PyLucene.Field(str(PyLuceneIndexer.UNNAMED_FIELD_NAME),
                term, True, True, tokenize))

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        # TODO: decoding (utf-8) is missing
        # Field parameters: name, string, store, index, token
        document.add(PyLucene.Field(str(field), term,
                True, True, tokenize))

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled query
        @type query: a query object of the real implementation
        @return: an object that allows access to the results
        @rtype: subclass of CommonEnquire
        """
        # NOTE(review): "PyLucene.indexSearcher" does not look like a module
        # attribute - presumably this should use the searcher instance held
        # by the parent class; verify against PyLuceneIndexer.PyLuceneDatabase
        return PyLucene.indexSearcher.search(query)

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches of
        a query

        @param query: the query to be issued
        @type query: a query object of the real implementation
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        if isinstance(fieldnames, basestring):
            fieldnames = [fieldnames]
        # NOTE(review): same suspicious "PyLucene.indexSearcher" reference as
        # in get_query_result - confirm the correct searcher attribute
        hits = PyLucene.indexSearcher.search(query)
        result = []
        for hit, doc in hits:
            fields = {}
            for fieldname in fieldnames:
                content = doc.get(fieldname)
                if content is not None:
                    fields[fieldname] = content
            result.append(fields)
        return result

    def _writer_open(self):
        """open write access for the indexing database and acquire an
        exclusive lock

        fixed: the original called
        C{super(PyLuceneIndexer1, self)._writer_open_()} - "PyLuceneIndexer1"
        is the module (not a class, so super() would raise a TypeError) and
        the parent method has no trailing underscore
        """
        super(PyLuceneDatabase, self)._writer_open()
        self.writer.maxFieldLength = PyLuceneIndexer.MAX_FIELD_SIZE
210