Engine

analyzers

lupyne.engine.analyzers.TokenStream

Bases: TokenStream

TokenStream mixin with support for iteration and attributes cached as properties.

Source code in lupyne/engine/analyzers.py
class TokenStream(analysis.TokenStream):
    """TokenStream mixin with support for iteration and attributes cached as properties."""

    def __iter__(self):
        self.reset()
        return self

    def __next__(self):
        if self.incrementToken():
            return self
        raise StopIteration

    def __getattr__(self, name):
        cls = getattr(analysis.tokenattributes, name + 'Attribute').class_
        attr = self.getAttribute(cls) if self.hasAttribute(cls) else self.addAttribute(cls)
        setattr(self, name, attr)
        return attr

    @property
    def offset(self) -> tuple:
        """start and stop character offset"""
        return self.Offset.startOffset(), self.Offset.endOffset()

    @offset.setter
    def offset(self, item: Iterable):
        self.Offset.setOffset(*item)

    @property
    def payload(self):
        """payload bytes"""
        payload = self.Payload.payload
        return payload and payload.utf8ToString()

    @payload.setter
    def payload(self, data):
        self.Payload.payload = util.BytesRef(data)

    @property
    def positionIncrement(self) -> int:
        """position relative to the previous token"""
        return self.PositionIncrement.positionIncrement

    @positionIncrement.setter
    def positionIncrement(self, index: int):
        self.PositionIncrement.positionIncrement = index

    @property
    def charTerm(self) -> str:
        """term text"""
        return str(self.CharTerm)

    @charTerm.setter
    def charTerm(self, text: str):
        self.CharTerm.setEmpty()
        self.CharTerm.append(text)

    @property
    def type(self) -> str:
        """lexical type"""
        return self.Type.type()

    @type.setter
    def type(self, text: str):
        self.Type.setType(text)

charTerm: str property writable

term text

offset: tuple property writable

start and stop character offset

payload property writable

payload bytes

positionIncrement: int property writable

position relative to the previous token

type: str property writable

lexical type
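
A minimal sketch of iterating a stream (assuming lucene.initVM() has been called; the analyzer and text are arbitrary). Wrapping the stream in the base TokenFilter below adds this mixin, so tokens can be read through the cached attribute properties:

import lucene
from lupyne import engine

lucene.initVM()
analyzer = engine.Analyzer.whitespace()
# TokenFilter mixes in this TokenStream, gaining iteration and cached attributes
stream = engine.analyzers.TokenFilter(analyzer.tokens('Hello World'))
for token in stream:
    print(token.charTerm, token.offset, token.positionIncrement)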

lupyne.engine.analyzers.TokenFilter

Bases: PythonTokenFilter, TokenStream

Create an iterable lucene TokenFilter from a TokenStream.

Subclass and override incrementToken.

Source code in lupyne/engine/analyzers.py
class TokenFilter(PythonTokenFilter, TokenStream):
    """Create an iterable lucene TokenFilter from a TokenStream.

    Subclass and override [incrementToken][lupyne.engine.analyzers.TokenFilter.incrementToken].
    """

    def __init__(self, input: analysis.TokenStream):
        super().__init__(input)
        self.input = input

    def incrementToken(self) -> bool:
        """Advance to next token and return whether the stream is not empty."""
        return self.input.incrementToken()

incrementToken()

Advance to next token and return whether the stream is not empty.

Source code in lupyne/engine/analyzers.py
def incrementToken(self) -> bool:
    """Advance to next token and return whether the stream is not empty."""
    return self.input.incrementToken()
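
As a sketch of subclassing (the filter name and behavior here are illustrative, not part of lupyne), a filter can rewrite the cached attributes after advancing the wrapped stream:

from lupyne import engine

class TypeAsPayload(engine.analyzers.TokenFilter):
    """Hypothetical filter which stores each token's lexical type as its payload."""

    def incrementToken(self):
        result = super().incrementToken()
        if result:
            self.payload = self.type  # cached attribute properties from TokenStream
        return result

analyzer = engine.Analyzer.whitespace(TypeAsPayload)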

lupyne.engine.analyzers.Analyzer

Bases: PythonAnalyzer

Return a lucene Analyzer which chains together a tokenizer and filters.

Parameters:

    tokenizer (Callable): lucene Tokenizer class or callable, called with no args. Required.
    *filters (Callable): lucene TokenFilter classes or callables, successively called on input tokens. Default: ().
Source code in lupyne/engine/analyzers.py
class Analyzer(PythonAnalyzer):
    """Return a lucene Analyzer which chains together a tokenizer and filters.

    Args:
        tokenizer: lucene Tokenizer class or callable, called with no args
        *filters: lucene TokenFilter classes or callables, successively called on input tokens
    """

    def __init__(self, tokenizer: Callable, *filters: Callable):
        super().__init__()
        self.tokenizer, self.filters = tokenizer, filters

    @classmethod
    def standard(cls, *filters: Callable) -> 'Analyzer':
        """Return equivalent of StandardAnalyzer with additional filters."""
        return cls(analysis.standard.StandardTokenizer, analysis.LowerCaseFilter, *filters)

    @classmethod
    def whitespace(cls, *filters: Callable) -> 'Analyzer':
        """Return equivalent of WhitespaceAnalyzer with additional filters."""
        return cls(analysis.core.WhitespaceTokenizer, *filters)

    def components(self, field, reader=None):
        source = tokens = self.tokenizer()
        if reader is not None:
            source.reader = reader
        for filter in self.filters:
            tokens = filter(tokens)
        return source, tokens

    def createComponents(self, field):
        return analysis.Analyzer.TokenStreamComponents(*self.components(field))

    def tokens(self, text: str, field: Optional[str] = None) -> analysis.TokenStream:
        """Return lucene TokenStream from text."""
        return self.components(field, StringReader(text))[1]

    def parse(self, query: str, field='', op='', parser=None, **attrs) -> search.Query:
        """Return parsed lucene Query.

        Args:
            query: query string
            field: default query field name, sequence of names, or boost mapping
            op: default query operator ('or', 'and')
            parser: custom PythonQueryParser class
            **attrs: additional attributes to set on the parser
        """
        # parsers aren't thread-safe (nor slow), so create one each time
        cls = queryparser.classic.MultiFieldQueryParser
        if isinstance(field, str):
            cls = queryparser.classic.QueryParser
        args: tuple = field, self
        if isinstance(field, Mapping):
            boosts = HashMap()
            for key in field:
                boosts.put(key, Float(field[key]))
            args = list(field), self, boosts
        parser = (parser or cls)(*args)
        if op:
            parser.defaultOperator = getattr(queryparser.classic.QueryParser.Operator, op.upper())
        for name, value in attrs.items():
            setattr(parser, name, value)
        if isinstance(parser, queryparser.classic.MultiFieldQueryParser):
            return parser.parse(parser, query)
        return parser.parse(query)

    def highlight(self, query: search.Query, field: str, content: str, count: int = 1) -> str:
        """Return highlighted content.

        Args:
            query: lucene Query
            field: field name
            content: text
            count: optional maximum number of passages
        """
        highlighter = uhighlight.UnifiedHighlighter(None, self)
        return str(highlighter.highlightWithoutSearcher(field, query, content, count))
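
A minimal construction sketch (assuming lucene.initVM() has been called; the field name is hypothetical):

from lupyne import engine

analyzer = engine.Analyzer.standard()                 # StandardTokenizer + LowerCaseFilter
stream = analyzer.tokens('Alice and Bob')             # lucene TokenStream
query = analyzer.parse('alice OR bob', field='text')  # lucene Query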

highlight(query, field, content, count=1)

Return highlighted content.

Parameters:

    query (Query): lucene Query. Required.
    field (str): field name. Required.
    content (str): text. Required.
    count (int): optional maximum number of passages. Default: 1.
Source code in lupyne/engine/analyzers.py
def highlight(self, query: search.Query, field: str, content: str, count: int = 1) -> str:
    """Return highlighted content.

    Args:
        query: lucene Query
        field: field name
        content: text
        count: optional maximum number of passages
    """
    highlighter = uhighlight.UnifiedHighlighter(None, self)
    return str(highlighter.highlightWithoutSearcher(field, query, content, count))
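
A usage sketch (the field name and content are hypothetical; by default lucene's UnifiedHighlighter wraps matches in <b> tags):

analyzer = engine.Analyzer.standard()
query = analyzer.parse('lucene', field='content')
analyzer.highlight(query, 'content', 'Lupyne is a Pythonic wrapper around lucene.')
# 'Lupyne is a Pythonic wrapper around <b>lucene</b>.'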

parse(query, field='', op='', parser=None, **attrs)

Return parsed lucene Query.

Parameters:

    query (str): query string. Required.
    field: default query field name, sequence of names, or boost mapping. Default: ''.
    op: default query operator ('or', 'and'). Default: ''.
    parser: custom PythonQueryParser class. Default: None.
    **attrs: additional attributes to set on the parser. Default: {}.
Source code in lupyne/engine/analyzers.py
def parse(self, query: str, field='', op='', parser=None, **attrs) -> search.Query:
    """Return parsed lucene Query.

    Args:
        query: query string
        field: default query field name, sequence of names, or boost mapping
        op: default query operator ('or', 'and')
        parser: custom PythonQueryParser class
        **attrs: additional attributes to set on the parser
    """
    # parsers aren't thread-safe (nor slow), so create one each time
    cls = queryparser.classic.MultiFieldQueryParser
    if isinstance(field, str):
        cls = queryparser.classic.QueryParser
    args: tuple = field, self
    if isinstance(field, Mapping):
        boosts = HashMap()
        for key in field:
            boosts.put(key, Float(field[key]))
        args = list(field), self, boosts
    parser = (parser or cls)(*args)
    if op:
        parser.defaultOperator = getattr(queryparser.classic.QueryParser.Operator, op.upper())
    for name, value in attrs.items():
        setattr(parser, name, value)
    if isinstance(parser, queryparser.classic.MultiFieldQueryParser):
        return parser.parse(parser, query)
    return parser.parse(query)
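
The type of the field argument selects the underlying parser, as sketched below (field names are hypothetical):

analyzer.parse('hello', field='title')                      # single default field
analyzer.parse('hello', field=['title', 'body'])            # MultiFieldQueryParser
analyzer.parse('hello', field={'title': 2.0, 'body': 1.0})  # per-field boosts
analyzer.parse('hello world', field='title', op='and')      # default AND operator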

standard(*filters) classmethod

Return equivalent of StandardAnalyzer with additional filters.

Source code in lupyne/engine/analyzers.py
@classmethod
def standard(cls, *filters: Callable) -> 'Analyzer':
    """Return equivalent of StandardAnalyzer with additional filters."""
    return cls(analysis.standard.StandardTokenizer, analysis.LowerCaseFilter, *filters)

tokens(text, field=None)

Return lucene TokenStream from text.

Source code in lupyne/engine/analyzers.py
def tokens(self, text: str, field: Optional[str] = None) -> analysis.TokenStream:
    """Return lucene TokenStream from text."""
    return self.components(field, StringReader(text))[1]

whitespace(*filters) classmethod

Return equivalent of WhitespaceAnalyzer with additional filters.

Source code in lupyne/engine/analyzers.py
@classmethod
def whitespace(cls, *filters: Callable) -> 'Analyzer':
    """Return equivalent of WhitespaceAnalyzer with additional filters."""
    return cls(analysis.core.WhitespaceTokenizer, *filters)

indexers

lupyne.engine.indexers.IndexReader

Delegated lucene IndexReader, with a mapping interface of ids to document objects.

Parameters:

    reader: lucene IndexReader. Required.
Source code in lupyne/engine/indexers.py
class IndexReader:
    """Delegated lucene IndexReader, with a mapping interface of ids to document objects.

    Args:
        reader: lucene IndexReader
    """

    def __init__(self, reader):
        self.indexReader = reader

    def __getattr__(self, name):
        if name == 'indexReader':
            raise AttributeError(name)
        return getattr(index.DirectoryReader.cast_(self.indexReader), name)

    def __len__(self):
        return self.numDocs()

    def __contains__(self, id: int):
        bits = self.bits
        return (0 <= id < self.maxDoc()) and (not bits or bits.get(id))

    def __iter__(self) -> Iterator[int]:
        ids = range(self.maxDoc())
        bits = self.bits
        return filter(bits.get, ids) if bits else iter(ids)

    @property
    def bits(self) -> util.Bits:
        return index.MultiBits.getLiveDocs(self.indexReader)

    @property
    def directory(self) -> store.Directory:
        """reader's lucene Directory"""
        return self.__getattr__('directory')()

    @property
    def path(self) -> str:
        """FSDirectory path"""
        return str(store.FSDirectory.cast_(self.directory).directory)

    @property
    def timestamp(self) -> float:
        """timestamp of reader's last commit"""
        return File(self.path, self.indexCommit.segmentsFileName).lastModified() * 0.001

    @property
    def readers(self) -> Iterator:
        """segment readers"""
        return (index.SegmentReader.cast_(context.reader()) for context in self.leaves())

    @property
    def segments(self) -> dict:
        """segment filenames with document counts"""
        return {reader.segmentName: reader.numDocs() for reader in self.readers}

    @property
    def fieldinfos(self) -> dict:
        """mapping of field names to lucene FieldInfos"""
        fieldinfos = index.FieldInfos.getMergedFieldInfos(self.indexReader)
        return {fieldinfo.name: fieldinfo for fieldinfo in fieldinfos.iterator()}

    def dictionary(self, name: str, *args) -> spell.Dictionary:
        """Return lucene Dictionary, suitable for spellcheckers."""
        cls = spell.HighFrequencyDictionary if args else spell.LuceneDictionary
        return cls(self.indexReader, name, *args)

    def suggest(self, name: str, value, count: int = 1, **attrs) -> list:
        """Return spelling suggestions from DirectSpellChecker.

        Args:
            name: field name
            value: term
            count: maximum number of suggestions
            **attrs: DirectSpellChecker options
        """
        checker = spell.DirectSpellChecker()
        for attr in attrs:
            setattr(checker, attr, attrs[attr])
        words = checker.suggestSimilar(index.Term(name, value), count, self.indexReader)
        return [word.string for word in words]

    def complete(self, name: str, prefix: str, count: int) -> list[str]:
        """Return autocomplete suggestions for word prefix."""
        terms = dict(self.terms(name, prefix, counts=True))
        return heapq.nlargest(count, terms, key=terms.__getitem__)

    def sortfield(self, name: str, type=None, reverse=False) -> search.SortField:
        """Return lucene SortField, deriving the the type from FieldInfos if necessary.

        Args:
            name: field name
            type: int, float, or name compatible with SortField constants
            reverse: reverse flag used with sort
        """
        if type is None:
            type = str(self.fieldinfos[name].docValuesType)
        type = Field.types.get(type, type).upper()
        return search.SortField(name, getattr(search.SortField.Type, type), reverse)

    def docvalues(self, name: str, type=None) -> DocValues.Sorted:
        """Return chained lucene DocValues, suitable for custom sorting or grouping.

        Note multi-valued DocValues aren't thread-safe and support only ordered iteration.

        Args:
            name: field name
            type: int or float for converting values
        """
        types = {int: int, float: util.NumericUtils.sortableLongToDouble}
        type = types.get(type, util.BytesRef.utf8ToString)
        docValuesType = str(self.fieldinfos[name].docValuesType).title().replace('_', '')
        method = getattr(index.MultiDocValues, f'get{docValuesType}Values')
        return getattr(DocValues, docValuesType)(method(self.indexReader, name), len(self), type)

    def copy(
        self, dest, query: search.Query = None, exclude: search.Query = None, merge: int = 0
    ) -> int:
        """Copy the index to the destination directory.

        Optimized to use hard links if the destination is a file system path.

        Args:
            dest: destination directory path or lucene Directory
            query: optional lucene Query to select documents
            exclude: optional lucene Query to exclude documents
            merge: optionally merge into maximum number of segments
        """
        copy(self.indexCommit, dest)
        with IndexWriter(dest) as writer:
            if query:
                writer.delete(Query.alldocs() - query)
            if exclude:
                writer.delete(exclude)
            writer.commit()
            writer.forceMergeDeletes()
            if merge:
                writer.forceMerge(merge)
            return len(writer)

    def terms(self, name: str, value='', stop='', counts=False, distance=0, prefix=0) -> Iterator:
        """Generate a slice of term values, optionally with frequency counts.

        Args:
            name: field name
            value: term prefix, start value (given stop), or fuzzy value (given distance)
            stop: optional upper bound for range terms
            counts: include frequency counts
            distance: maximum edit distance for fuzzy terms
            prefix: prefix length for fuzzy terms
        """
        terms = index.MultiTerms.getTerms(self.indexReader, name)
        if not terms:
            return iter([])
        term, termsenum = index.Term(name, value), terms.iterator()
        if distance:
            terms = termsenum = search.FuzzyTermsEnum(terms, term, distance, prefix, False)
        else:
            termsenum.seekCeil(util.BytesRef(value))
            terms = itertools.chain([termsenum.term()], util.BytesRefIterator.cast_(termsenum))
        terms = map(operator.methodcaller('utf8ToString'), terms)
        predicate = (
            partial(operator.gt, stop) if stop else operator.methodcaller('startswith', value)
        )
        if not distance:
            terms = itertools.takewhile(predicate, terms)  # type: ignore
        return ((term, termsenum.docFreq()) for term in terms) if counts else terms

    def docs(self, name: str, value, counts=False) -> Iterator:
        """Generate doc ids which contain given term, optionally with frequency counts."""
        docsenum = index.MultiTerms.getTermPostingsEnum(
            self.indexReader, name, util.BytesRef(value)
        )
        docs = iter(docsenum.nextDoc, index.PostingsEnum.NO_MORE_DOCS) if docsenum else ()
        return ((doc, docsenum.freq()) for doc in docs) if counts else iter(docs)  # type: ignore

    def positions(self, name: str, value, payloads=False, offsets=False) -> Iterator[tuple]:
        """Generate doc ids and positions which contain given term.

        Optionally with offsets, or only ones with payloads."""
        docsenum = index.MultiTerms.getTermPostingsEnum(
            self.indexReader, name, util.BytesRef(value)
        )
        for doc in iter(docsenum.nextDoc, index.PostingsEnum.NO_MORE_DOCS) if docsenum else ():
            positions = (docsenum.nextPosition() for _ in range(docsenum.freq()))
            if payloads:
                positions = (
                    (position, docsenum.payload.utf8ToString())
                    for position in positions
                    if docsenum.payload
                )
            elif offsets:
                positions = (
                    (docsenum.startOffset(), docsenum.endOffset()) for position in positions
                )
            yield doc, list(positions)

    def vector(self, id, field):
        terms = self.termVectors().get(id, field)
        termsenum = terms.iterator() if terms else index.TermsEnum.EMPTY
        terms = map(operator.methodcaller('utf8ToString'), util.BytesRefIterator.cast_(termsenum))
        return termsenum, terms

    def termvector(self, id: int, field: str, counts=False) -> Iterator:
        """Generate terms for given doc id and field, optionally with frequency counts."""
        termsenum, terms = self.vector(id, field)
        return ((term, termsenum.totalTermFreq()) for term in terms) if counts else terms

    def positionvector(self, id: int, field: str, offsets=False) -> Iterator[tuple]:
        """Generate terms and positions for given doc id and field, optionally with character offsets."""
        termsenum, terms = self.vector(id, field)
        for term in terms:
            docsenum = termsenum.postings(None)
            assert 0 <= docsenum.nextDoc() < docsenum.NO_MORE_DOCS
            positions = (docsenum.nextPosition() for _ in range(docsenum.freq()))
            if offsets:
                positions = ((docsenum.startOffset(), docsenum.endOffset()) for _ in positions)
            yield term, list(positions)

    def morelikethis(self, doc, *fields, **attrs) -> Query:
        """Return MoreLikeThis query for document.

        Args:
            doc: document id or text
            *fields: document fields to use, optional for termvectors
            **attrs: additional attributes to set on the morelikethis object
        """
        mlt = queries.mlt.MoreLikeThis(self.indexReader)
        mlt.fieldNames = fields or None
        for name, value in attrs.items():
            setattr(mlt, name, value)
        return mlt.like(fields[0], StringReader(doc)) if isinstance(doc, str) else mlt.like(doc)

directory: store.Directory property

reader's lucene Directory

fieldinfos: dict property

mapping of field names to lucene FieldInfos

path: str property

FSDirectory path

readers: Iterator property

segment readers

segments: dict property

segment filenames with document counts

timestamp: float property

timestamp of reader's last commit
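
IndexReader is normally mixed into a searcher, so the mapping interface can be sketched as follows (the index path is hypothetical):

from lupyne import engine

searcher = engine.IndexSearcher('index')
len(searcher)               # number of live documents
0 in searcher               # whether doc id 0 exists and is live
ids = list(searcher)        # iterate live doc ids
searcher.fieldinfos.keys()  # names of indexed fields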

complete(name, prefix, count)

Return autocomplete suggestions for word prefix.

Source code in lupyne/engine/indexers.py
def complete(self, name: str, prefix: str, count: int) -> list[str]:
    """Return autocomplete suggestions for word prefix."""
    terms = dict(self.terms(name, prefix, counts=True))
    return heapq.nlargest(count, terms, key=terms.__getitem__)

copy(dest, query=None, exclude=None, merge=0)

Copy the index to the destination directory.

Optimized to use hard links if the destination is a file system path.

Parameters:

    dest: destination directory path or lucene Directory. Required.
    query (Query): optional lucene Query to select documents. Default: None.
    exclude (Query): optional lucene Query to exclude documents. Default: None.
    merge (int): optionally merge into maximum number of segments. Default: 0.
Source code in lupyne/engine/indexers.py
def copy(
    self, dest, query: search.Query = None, exclude: search.Query = None, merge: int = 0
) -> int:
    """Copy the index to the destination directory.

    Optimized to use hard links if the destination is a file system path.

    Args:
        dest: destination directory path or lucene Directory
        query: optional lucene Query to select documents
        exclude: optional lucene Query to exclude documents
        merge: optionally merge into maximum number of segments
    """
    copy(self.indexCommit, dest)
    with IndexWriter(dest) as writer:
        if query:
            writer.delete(Query.alldocs() - query)
        if exclude:
            writer.delete(exclude)
        writer.commit()
        writer.forceMergeDeletes()
        if merge:
            writer.forceMerge(merge)
        return len(writer)

dictionary(name, *args)

Return lucene Dictionary, suitable for spellcheckers.

Source code in lupyne/engine/indexers.py
def dictionary(self, name: str, *args) -> spell.Dictionary:
    """Return lucene Dictionary, suitable for spellcheckers."""
    cls = spell.HighFrequencyDictionary if args else spell.LuceneDictionary
    return cls(self.indexReader, name, *args)

docs(name, value, counts=False)

Generate doc ids which contain given term, optionally with frequency counts.

Source code in lupyne/engine/indexers.py
def docs(self, name: str, value, counts=False) -> Iterator:
    """Generate doc ids which contain given term, optionally with frequency counts."""
    docsenum = index.MultiTerms.getTermPostingsEnum(
        self.indexReader, name, util.BytesRef(value)
    )
    docs = iter(docsenum.nextDoc, index.PostingsEnum.NO_MORE_DOCS) if docsenum else ()
    return ((doc, docsenum.freq()) for doc in docs) if counts else iter(docs)  # type: ignore

docvalues(name, type=None)

Return chained lucene DocValues, suitable for custom sorting or grouping.

Note multi-valued DocValues aren't thread-safe and support only ordered iteration.

Parameters:

    name (str): field name. Required.
    type: int or float for converting values. Default: None.
Source code in lupyne/engine/indexers.py
def docvalues(self, name: str, type=None) -> DocValues.Sorted:
    """Return chained lucene DocValues, suitable for custom sorting or grouping.

    Note multi-valued DocValues aren't thread-safe and support only ordered iteration.

    Args:
        name: field name
        type: int or float for converting values
    """
    types = {int: int, float: util.NumericUtils.sortableLongToDouble}
    type = types.get(type, util.BytesRef.utf8ToString)
    docValuesType = str(self.fieldinfos[name].docValuesType).title().replace('_', '')
    method = getattr(index.MultiDocValues, f'get{docValuesType}Values')
    return getattr(DocValues, docValuesType)(method(self.indexReader, name), len(self), type)

morelikethis(doc, *fields, **attrs)

Return MoreLikeThis query for document.

Parameters:

    doc: document id or text. Required.
    *fields: document fields to use, optional for termvectors. Default: ().
    **attrs: additional attributes to set on the morelikethis object. Default: {}.
Source code in lupyne/engine/indexers.py
def morelikethis(self, doc, *fields, **attrs) -> Query:
    """Return MoreLikeThis query for document.

    Args:
        doc: document id or text
        *fields: document fields to use, optional for termvectors
        **attrs: additional attributes to set on the morelikethis object
    """
    mlt = queries.mlt.MoreLikeThis(self.indexReader)
    mlt.fieldNames = fields or None
    for name, value in attrs.items():
        setattr(mlt, name, value)
    return mlt.like(fields[0], StringReader(doc)) if isinstance(doc, str) else mlt.like(doc)

positions(name, value, payloads=False, offsets=False)

Generate doc ids and positions which contain given term.

Optionally with offsets, or only ones with payloads.

Source code in lupyne/engine/indexers.py
def positions(self, name: str, value, payloads=False, offsets=False) -> Iterator[tuple]:
    """Generate doc ids and positions which contain given term.

    Optionally with offsets, or only ones with payloads."""
    docsenum = index.MultiTerms.getTermPostingsEnum(
        self.indexReader, name, util.BytesRef(value)
    )
    for doc in iter(docsenum.nextDoc, index.PostingsEnum.NO_MORE_DOCS) if docsenum else ():
        positions = (docsenum.nextPosition() for _ in range(docsenum.freq()))
        if payloads:
            positions = (
                (position, docsenum.payload.utf8ToString())
                for position in positions
                if docsenum.payload
            )
        elif offsets:
            positions = (
                (docsenum.startOffset(), docsenum.endOffset()) for position in positions
            )
        yield doc, list(positions)

positionvector(id, field, offsets=False)

Generate terms and positions for given doc id and field, optionally with character offsets.

Source code in lupyne/engine/indexers.py
def positionvector(self, id: int, field: str, offsets=False) -> Iterator[tuple]:
    """Generate terms and positions for given doc id and field, optionally with character offsets."""
    termsenum, terms = self.vector(id, field)
    for term in terms:
        docsenum = termsenum.postings(None)
        assert 0 <= docsenum.nextDoc() < docsenum.NO_MORE_DOCS
        positions = (docsenum.nextPosition() for _ in range(docsenum.freq()))
        if offsets:
            positions = ((docsenum.startOffset(), docsenum.endOffset()) for _ in positions)
        yield term, list(positions)

sortfield(name, type=None, reverse=False)

Return lucene SortField, deriving the type from FieldInfos if necessary.

Parameters:

    name (str): field name. Required.
    type: int, float, or name compatible with SortField constants. Default: None.
    reverse: reverse flag used with sort. Default: False.
Source code in lupyne/engine/indexers.py
def sortfield(self, name: str, type=None, reverse=False) -> search.SortField:
    """Return lucene SortField, deriving the the type from FieldInfos if necessary.

    Args:
        name: field name
        type: int, float, or name compatible with SortField constants
        reverse: reverse flag used with sort
    """
    if type is None:
        type = str(self.fieldinfos[name].docValuesType)
    type = Field.types.get(type, type).upper()
    return search.SortField(name, getattr(search.SortField.Type, type), reverse)

suggest(name, value, count=1, **attrs)

Return spelling suggestions from DirectSpellChecker.

Parameters:

    name (str): field name. Required.
    value: term. Required.
    count (int): maximum number of suggestions. Default: 1.
    **attrs: DirectSpellChecker options. Default: {}.
Source code in lupyne/engine/indexers.py
def suggest(self, name: str, value, count: int = 1, **attrs) -> list:
    """Return spelling suggestions from DirectSpellChecker.

    Args:
        name: field name
        value: term
        count: maximum number of suggestions
        **attrs: DirectSpellChecker options
    """
    checker = spell.DirectSpellChecker()
    for attr in attrs:
        setattr(checker, attr, attrs[attr])
    words = checker.suggestSimilar(index.Term(name, value), count, self.indexReader)
    return [word.string for word in words]
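
A usage sketch (the field, misspelled term, and DirectSpellChecker option shown are hypothetical):

searcher.suggest('title', 'lucen', count=3, maxEdits=2)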

terms(name, value='', stop='', counts=False, distance=0, prefix=0)

Generate a slice of term values, optionally with frequency counts.

Parameters:

    name (str): field name. Required.
    value: term prefix, start value (given stop), or fuzzy value (given distance). Default: ''.
    stop: optional upper bound for range terms. Default: ''.
    counts: include frequency counts. Default: False.
    distance: maximum edit distance for fuzzy terms. Default: 0.
    prefix: prefix length for fuzzy terms. Default: 0.
Source code in lupyne/engine/indexers.py
def terms(self, name: str, value='', stop='', counts=False, distance=0, prefix=0) -> Iterator:
    """Generate a slice of term values, optionally with frequency counts.

    Args:
        name: field name
        value: term prefix, start value (given stop), or fuzzy value (given distance)
        stop: optional upper bound for range terms
        counts: include frequency counts
        distance: maximum edit distance for fuzzy terms
        prefix: prefix length for fuzzy terms
    """
    terms = index.MultiTerms.getTerms(self.indexReader, name)
    if not terms:
        return iter([])
    term, termsenum = index.Term(name, value), terms.iterator()
    if distance:
        terms = termsenum = search.FuzzyTermsEnum(terms, term, distance, prefix, False)
    else:
        termsenum.seekCeil(util.BytesRef(value))
        terms = itertools.chain([termsenum.term()], util.BytesRefIterator.cast_(termsenum))
    terms = map(operator.methodcaller('utf8ToString'), terms)
    predicate = (
        partial(operator.gt, stop) if stop else operator.methodcaller('startswith', value)
    )
    if not distance:
        terms = itertools.takewhile(predicate, terms)  # type: ignore
    return ((term, termsenum.docFreq()) for term in terms) if counts else terms
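
The value, stop, and distance arguments select prefix, range, or fuzzy slices, as sketched below (field and values are hypothetical):

list(searcher.terms('title'))                        # all terms
list(searcher.terms('title', 'lu'))                  # terms with prefix 'lu'
list(searcher.terms('title', 'apple', 'banana'))     # range ['apple', 'banana')
list(searcher.terms('title', 'lucene', distance=1))  # fuzzy terms within 1 edit
list(searcher.terms('title', 'lu', counts=True))     # (term, docFreq) pairs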

termvector(id, field, counts=False)

Generate terms for given doc id and field, optionally with frequency counts.

Source code in lupyne/engine/indexers.py
def termvector(self, id: int, field: str, counts=False) -> Iterator:
    """Generate terms for given doc id and field, optionally with frequency counts."""
    termsenum, terms = self.vector(id, field)
    return ((term, termsenum.totalTermFreq()) for term in terms) if counts else terms

lupyne.engine.indexers.IndexSearcher

Bases: IndexSearcher, IndexReader

Inherited lucene IndexSearcher, with a mixed-in IndexReader.

Parameters:

    directory: directory path, lucene Directory, or lucene IndexReader. Required.
    analyzer: lucene Analyzer, default StandardAnalyzer. Default: None.
Source code in lupyne/engine/indexers.py
class IndexSearcher(search.IndexSearcher, IndexReader):
    """Inherited lucene IndexSearcher, with a mixed-in IndexReader.

    Args:
        directory: directory path, lucene Directory, or lucene IndexReader
        analyzer: lucene Analyzer, default StandardAnalyzer
    """

    def __init__(self, directory, analyzer=None):
        self.shared = closing()
        super().__init__(self.shared.reader(directory))
        self.analyzer = self.shared.analyzer(analyzer)

    def __del__(self):
        if hash(self):  # pragma: no branch
            self.decRef()

    def openIfChanged(self):
        return index.DirectoryReader.openIfChanged(index.DirectoryReader.cast_(self.indexReader))

    def reopen(self) -> 'IndexSearcher':
        """Return current [IndexSearcher][lupyne.engine.indexers.IndexSearcher].

        Only creates a new one if necessary.
        """
        reader = self.openIfChanged()
        if reader is None:
            return self
        other = type(self)(reader, self.analyzer)
        other.decRef()
        other.shared = self.shared
        return other

    def __getitem__(self, id: int) -> Document:
        return Document(self.storedFields().document(id))

    def get(self, id: int, *fields: str) -> Document:
        """Return [Document][lupyne.engine.documents.Document] with only selected fields loaded."""
        return Document(self.storedFields().document(id, HashSet(Arrays.asList(fields))))

    def spans(self, query: spans.SpanQuery, positions=False) -> Iterator[tuple]:
        """Generate docs with occurrence counts for a span query.

        Args:
            query: lucene SpanQuery
            positions: optionally include slice positions instead of counts
        """
        offset = 0
        weight = query.createWeight(self, search.ScoreMode.COMPLETE_NO_SCORES, 1.0)
        postings = queries.spans.SpanWeight.Postings.POSITIONS
        for reader in self.readers:
            try:
                spans = weight.getSpans(reader.context, postings)
            except lucene.JavaError:  # EOF
                continue
            for doc in iter(spans.nextDoc, spans.NO_MORE_DOCS):
                starts = iter(spans.nextStartPosition, spans.NO_MORE_POSITIONS)
                if positions:
                    values = [(start, spans.endPosition()) for start in starts]
                else:
                    values = sum(1 for _ in starts)  # type: ignore
                yield (doc + offset), values
            offset += reader.maxDoc()

    def parse(self, query, spellcheck=False, **kwargs) -> search.Query:
        if isinstance(query, search.Query):
            return query
        if spellcheck:
            kwargs['parser'], kwargs['searcher'] = SpellParser, self
        return Analyzer.parse(self.analyzer, query, **kwargs)

    @property
    def highlighter(self) -> uhighlight.UnifiedHighlighter:
        """lucene UnifiedHighlighter"""
        return uhighlight.UnifiedHighlighter(self, self.analyzer)

    def count(self, *query, **options) -> int:
        """Return number of hits for given query or term.

        Args:
            *query: [search][lupyne.engine.indexers.IndexSearcher.search] compatible query, or optimally a name and value
            **options: additional [search][lupyne.engine.indexers.IndexSearcher.search] options
        """
        if len(query) > 1:
            return self.docFreq(index.Term(*query))
        return super().count(self.parse(*query, **options) if query else Query.alldocs())

    def collector(self, count=None, sort=None, reverse=False, scores=False, mincount=1000):
        if count is None:
            return search.CachingCollector.create(True, float('inf'))
        count = min(count, self.maxDoc() or 1)
        mincount = max(count, mincount)
        if sort is None:
            return search.TopScoreDocCollectorManager(count, mincount).newCollector()
        if isinstance(sort, str):
            sort = self.sortfield(sort, reverse=reverse)
        if not isinstance(sort, search.Sort):
            sort = search.Sort(sort)
        return search.TopFieldCollectorManager(sort, count, mincount).newCollector()

    def search(
        self,
        query=None,
        count=None,
        sort=None,
        reverse=False,
        scores=False,
        mincount=1000,
        **parser,
    ) -> Hits:
        """Run query and return [Hits][lupyne.engine.documents.Hits].

        Note:
            changed in version 2.3: maxscore option removed; use Hits.maxscore property

        Args:
            query: query string or lucene Query
            count: maximum number of hits to retrieve
            sort: lucene Sort parameters
            reverse: reverse flag used with sort
            scores: compute scores for candidate results when sorting
            mincount: total hit count accuracy threshold
            **parser: [parse][lupyne.engine.analyzers.Analyzer.parse] options
        """
        query = Query.alldocs() if query is None else self.parse(query, **parser)
        results = cache = collector = self.collector(count, sort, reverse, scores, mincount)
        super().search(query, results)
        if isinstance(cache, search.CachingCollector):
            collector = search.TotalHitCountCollector()
            cache.replay(collector)
            count = collector.totalHits or 1
            collector = self.collector(count, sort, reverse, scores, count)
            cache.replay(collector)
        topdocs = collector.topDocs()
        if scores:
            search.TopFieldCollector.populateScores(topdocs.scoreDocs, self, query)
        return Hits(self, topdocs.scoreDocs, topdocs.totalHits)

    def facets(self, query, *fields: str, **query_map: dict) -> dict:
        """Return mapping of document counts for the intersection with each facet.

        Args:
            query: query string or lucene Query
            *fields: field names for lucene GroupingSearch
            **query_map: `{facet: {key: query, ...}, ...}` for intersected query counts
        """
        query = self.parse(query)
        counts = {field: self.groupby(field, query).facets for field in fields}
        for facet, values in query_map.items():
            counts[facet] = {key: self.count(Query.all(query, values[key])) for key in values}
        return counts

    def groupby(
        self, field: str, query, count: Optional[int] = None, start: int = 0, **attrs
    ) -> Groups:
        """Return [Hits][lupyne.engine.documents.Hits] grouped by field
        using a [GroupingSearch][lupyne.engine.documents.GroupingSearch]."""
        return GroupingSearch(field, **attrs).search(self, self.parse(query), count, start)

    def match(self, document: Mapping, *queries) -> Iterator[float]:
        """Generate scores for all queries against a given document mapping."""
        searcher = index.memory.MemoryIndex()
        for name, value in document.items():
            args = [self.analyzer] * isinstance(value, str)
            searcher.addField(name, value, *args)
        return (searcher.search(self.parse(query)) for query in queries)
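
Typical usage, as a minimal sketch (the index path, query, and stored field are hypothetical):

from lupyne import engine

searcher = engine.IndexSearcher('index')
hits = searcher.search('text:lucene', count=10)
for hit in hits:
    print(hit.score, hit.get('title'))
searcher = searcher.reopen()  # refresh the view after external writes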

highlighter: uhighlight.UnifiedHighlighter property

lucene UnifiedHighlighter

count(*query, **options)

Return number of hits for given query or term.

Parameters:

    *query: search compatible query, or optimally a name and value. Default: ().
    **options: additional search options. Default: {}.
Source code in lupyne/engine/indexers.py
def count(self, *query, **options) -> int:
    """Return number of hits for given query or term.

    Args:
        *query: [search][lupyne.engine.indexers.IndexSearcher.search] compatible query, or optimally a name and value
        **options: additional [search][lupyne.engine.indexers.IndexSearcher.search] options
    """
    if len(query) > 1:
        return self.docFreq(index.Term(*query))
    return super().count(self.parse(*query, **options) if query else Query.alldocs())

facets(query, *fields, **query_map)

Return mapping of document counts for the intersection with each facet.

Parameters:

    query: query string or lucene Query. Required.
    *fields (str): field names for lucene GroupingSearch. Default: ().
    **query_map (dict): {facet: {key: query, ...}, ...} for intersected query counts. Default: {}.
Source code in lupyne/engine/indexers.py
def facets(self, query, *fields: str, **query_map: dict) -> dict:
    """Return mapping of document counts for the intersection with each facet.

    Args:
        query: query string or lucene Query
        *fields: field names for lucene GroupingSearch
        **query_map: `{facet: {key: query, ...}, ...}` for intersected query counts
    """
    query = self.parse(query)
    counts = {field: self.groupby(field, query).facets for field in fields}
    for facet, values in query_map.items():
        counts[facet] = {key: self.count(Query.all(query, values[key])) for key in values}
    return counts
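
A sketch of both facet styles (field names and queries are hypothetical; the result shape is illustrative):

searcher.facets('lucene', 'category')
# {'category': {'python': 5, 'java': 3, ...}}
searcher.facets('lucene', size={'small': engine.Query.term('size', 'small')})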

get(id, *fields)

Return Document with only selected fields loaded.

Source code in lupyne/engine/indexers.py
def get(self, id: int, *fields: str) -> Document:
    """Return [Document][lupyne.engine.documents.Document] with only selected fields loaded."""
    return Document(self.storedFields().document(id, HashSet(Arrays.asList(fields))))

groupby(field, query, count=None, start=0, **attrs)

Return Hits grouped by field using a GroupingSearch.

Source code in lupyne/engine/indexers.py
def groupby(
    self, field: str, query, count: Optional[int] = None, start: int = 0, **attrs
) -> Groups:
    """Return [Hits][lupyne.engine.documents.Hits] grouped by field
    using a [GroupingSearch][lupyne.engine.documents.GroupingSearch]."""
    return GroupingSearch(field, **attrs).search(self, self.parse(query), count, start)

match(document, *queries)

Generate scores for all queries against a given document mapping.

Source code in lupyne/engine/indexers.py
def match(self, document: Mapping, *queries) -> Iterator[float]:
    """Generate scores for all queries against a given document mapping."""
    searcher = index.memory.MemoryIndex()
    for name, value in document.items():
        args = [self.analyzer] * isinstance(value, str)
        searcher.addField(name, value, *args)
    return (searcher.search(self.parse(query)) for query in queries)

reopen()

Return current IndexSearcher.

Only creates a new one if necessary.

Source code in lupyne/engine/indexers.py
def reopen(self) -> 'IndexSearcher':
    """Return current [IndexSearcher][lupyne.engine.indexers.IndexSearcher].

    Only creates a new one if necessary.
    """
    reader = self.openIfChanged()
    if reader is None:
        return self
    other = type(self)(reader, self.analyzer)
    other.decRef()
    other.shared = self.shared
    return other

search(query=None, count=None, sort=None, reverse=False, scores=False, mincount=1000, **parser)

Run query and return Hits.

Note

changed in version 2.3: maxscore option removed; use Hits.maxscore property

Parameters:

    query: query string or lucene Query. Default: None.
    count: maximum number of hits to retrieve. Default: None.
    sort: lucene Sort parameters. Default: None.
    reverse: reverse flag used with sort. Default: False.
    scores: compute scores for candidate results when sorting. Default: False.
    mincount: total hit count accuracy threshold. Default: 1000.
    **parser: parse options. Default: {}.
Source code in lupyne/engine/indexers.py
def search(
    self,
    query=None,
    count=None,
    sort=None,
    reverse=False,
    scores=False,
    mincount=1000,
    **parser,
) -> Hits:
    """Run query and return [Hits][lupyne.engine.documents.Hits].

    Note:
        changed in version 2.3: maxscore option removed; use Hits.maxscore property

    Args:
        query: query string or lucene Query
        count: maximum number of hits to retrieve
        sort: lucene Sort parameters
        reverse: reverse flag used with sort
        scores: compute scores for candidate results when sorting
        mincount: total hit count accuracy threshold
        **parser: [parse][lupyne.engine.analyzers.Analyzer.parse] options
    """
    query = Query.alldocs() if query is None else self.parse(query, **parser)
    results = cache = collector = self.collector(count, sort, reverse, scores, mincount)
    super().search(query, results)
    if isinstance(cache, search.CachingCollector):
        collector = search.TotalHitCountCollector()
        cache.replay(collector)
        count = collector.totalHits or 1
        collector = self.collector(count, sort, reverse, scores, count)
        cache.replay(collector)
    topdocs = collector.topDocs()
    if scores:
        search.TopFieldCollector.populateScores(topdocs.scoreDocs, self, query)
    return Hits(self, topdocs.scoreDocs, topdocs.totalHits)
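
A sorting sketch, assuming a hypothetical 'date' field indexed with doc values:

hits = searcher.search('lucene', count=5, sort='date', reverse=True)
hits = searcher.search('lucene', count=5, sort='date', scores=True)  # also score the sorted hits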

spans(query, positions=False)

Generate docs with occurrence counts for a span query.

Parameters:

    query (SpanQuery): lucene SpanQuery. Required.
    positions: optionally include slice positions instead of counts. Default: False.
Source code in lupyne/engine/indexers.py
def spans(self, query: spans.SpanQuery, positions=False) -> Iterator[tuple]:
    """Generate docs with occurrence counts for a span query.

    Args:
        query: lucene SpanQuery
        positions: optionally include slice positions instead of counts
    """
    offset = 0
    weight = query.createWeight(self, search.ScoreMode.COMPLETE_NO_SCORES, 1.0)
    postings = queries.spans.SpanWeight.Postings.POSITIONS
    for reader in self.readers:
        try:
            spans = weight.getSpans(reader.context, postings)
        except lucene.JavaError:  # EOF
            continue
        for doc in iter(spans.nextDoc, spans.NO_MORE_DOCS):
            starts = iter(spans.nextStartPosition, spans.NO_MORE_POSITIONS)
            if positions:
                values = [(start, spans.endPosition()) for start in starts]
            else:
                values = sum(1 for _ in starts)  # type: ignore
            yield (doc + offset), values
        offset += reader.maxDoc()

lupyne.engine.indexers.MultiSearcher

Bases: IndexSearcher

IndexSearcher with underlying lucene MultiReader.

Parameters:

    reader: directory paths, Directories, IndexReaders, or a single MultiReader. Required.
    analyzer: lucene Analyzer, default StandardAnalyzer. Default: None.
Source code in lupyne/engine/indexers.py
class MultiSearcher(IndexSearcher):
    """IndexSearcher with underlying lucene MultiReader.

    Args:
        reader: directory paths, Directories, IndexReaders, or a single MultiReader
        analyzer: lucene Analyzer, default StandardAnalyzer
    """

    def __init__(self, reader, analyzer=None):
        super().__init__(reader, analyzer)
        self.indexReaders = [
            index.DirectoryReader.cast_(context.reader()) for context in self.context.children()
        ]
        self.version = sum(reader.version for reader in self.indexReaders)

    def __getattr__(self, name):
        return getattr(index.MultiReader.cast_(self.indexReader), name)

    def openIfChanged(self):
        readers = list(map(index.DirectoryReader.openIfChanged, self.indexReaders))
        if any(readers):
            readers = [new or old.incRef() or old for new, old in zip(readers, self.indexReaders)]
            return index.MultiReader(readers)

    @property
    def timestamp(self):
        return max(IndexReader(reader).timestamp for reader in self.indexReaders)

lupyne.engine.indexers.IndexWriter

Bases: IndexWriter

Inherited lucene IndexWriter.

Supports setting fields parameters explicitly, so documents can be represented as dictionaries.

Parameters:

    directory: directory path or lucene Directory. Required.
    mode (str): file mode (rwa), except updating (+) is implied. Default: 'a'.
    analyzer: lucene Analyzer, default StandardAnalyzer. Default: None.
    version: lucene Version argument passed to IndexWriterConfig, default is latest. Default: None.
    **attrs: additional attributes to set on IndexWriterConfig. Default: {}.
Source code in lupyne/engine/indexers.py
class IndexWriter(index.IndexWriter):
    """Inherited lucene IndexWriter.

    Supports setting fields parameters explicitly, so documents can be represented as dictionaries.

    Args:
        directory: directory path or lucene Directory
        mode: file mode (rwa), except updating (+) is implied
        analyzer: lucene Analyzer, default StandardAnalyzer
        version: lucene Version argument passed to IndexWriterConfig, default is latest
        **attrs: additional attributes to set on IndexWriterConfig
    """

    parse = IndexSearcher.parse

    def __init__(self, directory, mode: str = 'a', analyzer=None, version=None, **attrs):
        self.shared = closing()
        args = [] if analyzer is None else [self.shared.analyzer(analyzer)]
        config = index.IndexWriterConfig(*args)
        config.openMode = index.IndexWriterConfig.OpenMode.values()['wra'.index(mode)]
        for name, value in attrs.items():
            setattr(config, name, value)
        self.policy = index.SnapshotDeletionPolicy(config.indexDeletionPolicy)
        config.indexDeletionPolicy = self.policy
        super().__init__(self.shared.directory(directory), config)
        self.fields: dict = {}

    def __del__(self):
        if hash(self):
            with suppress(IOException):
                self.close()

    def __len__(self):
        return self.docStats.numDocs

    @classmethod
    def check(cls, directory, repair=False) -> index.CheckIndex.Status:
        """Check and optionally fix unlocked index, returning lucene CheckIndex.Status."""
        with closing.store(directory) as directory:
            with contextlib.closing(index.CheckIndex(directory)) as checkindex:
                status = checkindex.checkIndex()
                if repair:
                    checkindex.exorciseIndex(status)
        return status

    def set(self, name: str, cls=Field, **settings) -> Field:
        """Assign settings to field name and return the field.

        Args:
            name: registered name of field
            cls: optional [Field][lupyne.engine.documents.Field] constructor
            **settings: stored, indexed, etc. options compatible with [Field][lupyne.engine.documents.Field]
        """
        field = self.fields[name] = cls(name, **settings)
        return field

    def document(self, items=(), **terms) -> document.Document:
        """Return lucene Document from mapping of field names to one or multiple values."""
        doc = document.Document()
        for name, values in dict(items, **terms).items():
            if isinstance(values, Atomic):
                values = (values,)
            for field in self.fields[name].items(*values):
                doc.add(field)
        return doc

    def add(self, document=(), **terms):
        """Add [document][lupyne.engine.indexers.IndexWriter.document] to index with optional boost."""
        self.addDocument(self.document(document, **terms))

    def update(self, name: str, value='', document=(), **terms):
        """Atomically delete documents which match given term
        and add the new [document][lupyne.engine.indexers.IndexWriter.document]."""
        doc = self.document(document, **terms)
        term = index.Term(name, *[value] if value else doc.getValues(name))
        fields = list(doc.iterator())
        types = [Field.cast_(field.fieldType()) for field in fields]
        noindex = index.IndexOptions.NONE
        if any(
            ft.stored() or ft.indexOptions() != noindex or Field.dimensions.fget(ft) for ft in types
        ):
            self.updateDocument(term, doc)
        elif fields:
            self.updateDocValues(term, *fields)

    def delete(self, *query, **options):
        """Remove documents which match given query or term.

        Args:
            *query: [search][lupyne.engine.indexers.IndexSearcher.search] compatible query, or optimally a name and value
            **options: additional [parse][lupyne.engine.analyzers.Analyzer.parse] options
        """
        parse = self.parse if len(query) == 1 else index.Term
        self.deleteDocuments(parse(*query, **options))

    def __iadd__(self, directory):
        """Add directory (or reader, searcher, writer) to index."""
        with closing.store(getattr(directory, 'directory', directory)) as directory:
            self.addIndexes([directory])
        return self

    @contextlib.contextmanager
    def snapshot(self):
        """Return context manager of an index commit snapshot."""
        commit = self.policy.snapshot()
        try:
            yield commit
        finally:
            self.policy.release(commit)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        if any(args):
            self.rollback()
        else:
            self.commit()
        self.close()
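
Typical usage as a context manager, sketched below (the path and field settings are hypothetical):

from lupyne import engine

with engine.IndexWriter('index') as writer:
    writer.set('title', stored=True)  # register field settings once
    writer.add(title='Lupyne')        # documents as keyword arguments
# __exit__ commits on success and rolls back on error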

__iadd__(directory)

Add directory (or reader, searcher, writer) to index.

Source code in lupyne/engine/indexers.py
def __iadd__(self, directory):
    """Add directory (or reader, searcher, writer) to index."""
    with closing.store(getattr(directory, 'directory', directory)) as directory:
        self.addIndexes([directory])
    return self

add(document=(), **terms)

Add document to index with optional boost.

Source code in lupyne/engine/indexers.py
def add(self, document=(), **terms):
    """Add [document][lupyne.engine.indexers.IndexWriter.document] to index with optional boost."""
    self.addDocument(self.document(document, **terms))

check(directory, repair=False) classmethod

Check and optionally fix unlocked index, returning lucene CheckIndex.Status.

Source code in lupyne/engine/indexers.py
@classmethod
def check(cls, directory, repair=False) -> index.CheckIndex.Status:
    """Check and optionally fix unlocked index, returning lucene CheckIndex.Status."""
    with closing.store(directory) as directory:
        with contextlib.closing(index.CheckIndex(directory)) as checkindex:
            status = checkindex.checkIndex()
            if repair:
                checkindex.exorciseIndex(status)
    return status

delete(*query, **options)

Remove documents which match given query or term.

Parameters:

    *query: search compatible query, or optimally a name and value (default: ())
    **options: additional parse options (default: {})
Source code in lupyne/engine/indexers.py
def delete(self, *query, **options):
    """Remove documents which match given query or term.

    Args:
        *query: [search][lupyne.engine.indexers.IndexSearcher.search] compatible query, or optimally a name and value
        **options: additional [parse][lupyne.engine.analyzers.Analyzer.parse] options
    """
    parse = self.parse if len(query) == 1 else index.Term
    self.deleteDocuments(parse(*query, **options))

document(items=(), **terms)

Return lucene Document from mapping of field names to one or multiple values.

Source code in lupyne/engine/indexers.py
def document(self, items=(), **terms) -> document.Document:
    """Return lucene Document from mapping of field names to one or multiple values."""
    doc = document.Document()
    for name, values in dict(items, **terms).items():
        if isinstance(values, Atomic):
            values = (values,)
        for field in self.fields[name].items(*values):
            doc.add(field)
    return doc

set(name, cls=Field, **settings)

Assign settings to field name and return the field.

Parameters:

    name (str): registered name of field (required)
    cls: optional Field constructor (default: Field)
    **settings: stored, indexed, etc. options compatible with Field (default: {})
Source code in lupyne/engine/indexers.py
def set(self, name: str, cls=Field, **settings) -> Field:
    """Assign settings to field name and return the field.

    Args:
        name: registered name of field
        cls: optional [Field][lupyne.engine.documents.Field] constructor
        **settings: stored, indexed, etc. options compatible with [Field][lupyne.engine.documents.Field]
    """
    field = self.fields[name] = cls(name, **settings)
    return field

snapshot()

Return context manager of an index commit snapshot.

Source code in lupyne/engine/indexers.py
@contextlib.contextmanager
def snapshot(self):
    """Return context manager of an index commit snapshot."""
    commit = self.policy.snapshot()
    try:
        yield commit
    finally:
        self.policy.release(commit)

update(name, value='', document=(), **terms)

Atomically delete documents which match given term and add the new document.

Source code in lupyne/engine/indexers.py
def update(self, name: str, value='', document=(), **terms):
    """Atomically delete documents which match given term
    and add the new [document][lupyne.engine.indexers.IndexWriter.document]."""
    doc = self.document(document, **terms)
    term = index.Term(name, *[value] if value else doc.getValues(name))
    fields = list(doc.iterator())
    types = [Field.cast_(field.fieldType()) for field in fields]
    noindex = index.IndexOptions.NONE
    if any(
        ft.stored() or ft.indexOptions() != noindex or Field.dimensions.fget(ft) for ft in types
    ):
        self.updateDocument(term, doc)
    elif fields:
        self.updateDocValues(term, *fields)

lupyne.engine.indexers.Indexer

Bases: IndexWriter

An all-purpose interface to an index.

Creates an IndexWriter with a delegated IndexSearcher.

Parameters:

    nrt: optionally use a near real-time searcher (default: False)
Source code in lupyne/engine/indexers.py
class Indexer(IndexWriter):
    """An all-purpose interface to an index.

    Creates an [IndexWriter][lupyne.engine.indexers.IndexWriter]
    with a delegated [IndexSearcher][lupyne.engine.indexers.IndexSearcher].

    Args:
        nrt: optionally use a near real-time searcher
    """

    def __init__(self, directory, mode='a', analyzer=None, version=None, nrt=False, **attrs):
        super().__init__(directory, mode, analyzer, version, **attrs)
        super().commit()
        self.nrt = nrt
        self.indexSearcher = IndexSearcher(self if nrt else self.directory, self.analyzer)

    def __getattr__(self, name):
        if name == 'indexSearcher':
            raise AttributeError(name)
        return getattr(self.indexSearcher, name)

    def __contains__(self, id):
        return id in self.indexSearcher

    def __iter__(self):
        return iter(self.indexSearcher)

    def __getitem__(self, id):
        return self.indexSearcher[id]

    def refresh(self):
        """Store refreshed searcher from [reopening][lupyne.engine.indexers.IndexSearcher.reopen]."""
        self.indexSearcher = self.indexSearcher.reopen()

    def commit(self, merge: int = False):
        """Commit writes and [refresh][lupyne.engine.indexers.Indexer.refresh] searcher.

        Args:
            merge: merge segments with deletes, or optionally specify maximum number of segments
        """
        super().commit()
        if merge:
            if isinstance(merge, bool):
                self.forceMergeDeletes()
            else:
                self.forceMerge(merge)
            super().commit()
        self.refresh()

commit(merge=False)

Commit writes and refresh searcher.

Parameters:

    merge (int): merge segments with deletes, or optionally specify maximum number of segments (default: False)
Source code in lupyne/engine/indexers.py
def commit(self, merge: int = False):
    """Commit writes and [refresh][lupyne.engine.indexers.Indexer.refresh] searcher.

    Args:
        merge: merge segments with deletes, or optionally specify maximum number of segments
    """
    super().commit()
    if merge:
        if isinstance(merge, bool):
            self.forceMergeDeletes()
        else:
            self.forceMerge(merge)
        super().commit()
    self.refresh()

refresh()

Store refreshed searcher from reopening.

Source code in lupyne/engine/indexers.py
def refresh(self):
    """Store refreshed searcher from [reopening][lupyne.engine.indexers.IndexSearcher.reopen]."""
    self.indexSearcher = self.indexSearcher.reopen()
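
Since searcher methods are delegated, an Indexer both writes and searches. A minimal sketch, assuming the delegated search accepts a parsed query string; the path and field names are hypothetical:

from lupyne import engine

indexer = engine.Indexer('index')  # IndexWriter with a delegated IndexSearcher
indexer.set('title', engine.Field.Text, stored=True)
indexer.add(title='hello world')
indexer.commit()  # commit writes and refresh the searcher
hits = indexer.search('title:hello')  # delegated to the IndexSearcher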

documents

lupyne.engine.documents.Document

Bases: dict

Multimapping of field names to values, but default getters return the first value.

Source code in lupyne/engine/documents.py
class Document(dict):
    """Multimapping of field names to values, but default getters return the first value."""

    def __init__(self, doc: document.Document):
        for field in doc.iterator():
            value = convert(field.numericValue() or field.stringValue() or field.binaryValue())
            self.setdefault(field.name(), []).append(value)

    def __getitem__(self, name):
        return super().__getitem__(name)[0]

    def get(self, name: str, default=None):
        return super().get(name, [default])[0]

    def getlist(self, name: str) -> list:
        """Return list of all values for given field."""
        return super().get(name, [])

    def dict(self, *names: str, **defaults) -> dict:
        """Return dict representation of document.

        Args:
            *names: names of multi-valued fields to return as a list
            **defaults: include only given fields, using default values as necessary
        """
        defaults |= {name: self[name] for name in (defaults or self) if name in self}
        return defaults | {name: self.getlist(name) for name in names}

dict(*names, **defaults)

Return dict representation of document.

Parameters:

    *names (str): names of multi-valued fields to return as a list (default: ())
    **defaults: include only given fields, using default values as necessary (default: {})
Source code in lupyne/engine/documents.py
def dict(self, *names: str, **defaults) -> dict:
    """Return dict representation of document.

    Args:
        *names: names of multi-valued fields to return as a list
        **defaults: include only given fields, using default values as necessary
    """
    defaults |= {name: self[name] for name in (defaults or self) if name in self}
    return defaults | {name: self.getlist(name) for name in names}

getlist(name)

Return list of all values for given field.

Source code in lupyne/engine/documents.py
def getlist(self, name: str) -> list:
    """Return list of all values for given field."""
    return super().get(name, [])
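
A short sketch of the multimapping behavior, assuming a retrieved document with one stored title and multiple stored authors (field names hypothetical):

doc['title']           # first (and only) title value
doc.getlist('author')  # list of all author values, possibly empty
doc.get('year', 0)     # first value, or the default if the field is absent
doc.dict('author')     # dict representation with 'author' kept as a list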

lupyne.engine.documents.Hit

Bases: Document

A Document from a search result, with id, score, and optional sortkeys.

Note

changed in version 2.4: keys renamed to sortkeys

Source code in lupyne/engine/documents.py
class Hit(Document):
    """A Document from a search result, with :attr:`id`, :attr:`score`, and optional :attr:`sortkeys`.

    Note:
        changed in version 2.4: keys renamed to :attr:`sortkeys`
    """

    def __init__(self, doc: document.Document, id: int, score: float, sortkeys=()):
        super().__init__(doc)
        self.id, self.score = id, score
        self.sortkeys = tuple(map(convert, sortkeys))

    def dict(self, *names: str, **defaults) -> dict:
        """Return dict representation of document with __id__, __score__, and any sort __keys__."""
        result = super().dict(*names, **defaults)
        result.update(__id__=self.id, __score__=self.score)
        if self.sortkeys:
            result['__sortkeys__'] = self.sortkeys
        return result

dict(*names, **defaults)

Return dict representation of document with id, score, and any sort keys.

Source code in lupyne/engine/documents.py
def dict(self, *names: str, **defaults) -> dict:
    """Return dict representation of document with __id__, __score__, and any sort __keys__."""
    result = super().dict(*names, **defaults)
    result.update(__id__=self.id, __score__=self.score)
    if self.sortkeys:
        result['__sortkeys__'] = self.sortkeys
    return result
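
The reserved keys are dunder-named to avoid colliding with field names. A sketch of the result shape, assuming a stored title field:

hit.dict()  # {'title': ..., '__id__': <doc id>, '__score__': <score>}
            # plus '__sortkeys__': (...) when the search was sorted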

lupyne.engine.documents.Hits

Search results: lazily evaluated and memory efficient.

Provides a read-only sequence interface to hit objects.

Note

changed in version 2.3: maxscore option removed; computed property instead

Parameters:

    searcher: IndexSearcher which can retrieve documents (required)
    scoredocs (Sequence): lucene ScoreDocs (required)
    count: total number of hits; float indicates estimate (default: 0)
    fields: optional field selectors (default: None)
Source code in lupyne/engine/documents.py
class Hits:
    """Search results: lazily evaluated and memory efficient.

    Provides a read-only sequence interface to hit objects.

    Note:
        changed in version 2.3: maxscore option removed; computed property instead

    Args:
        searcher: [IndexSearcher][lupyne.engine.indexers.IndexSearcher] which can retrieve documents
        scoredocs: lucene ScoreDocs
        count: total number of hits; float indicates estimate
        fields: optional field selectors
    """

    def __init__(self, searcher, scoredocs: Sequence, count=0, fields=None):
        self.searcher, self.scoredocs = searcher, scoredocs
        if hasattr(count, 'relation'):
            cls = int if count.relation == search.TotalHits.Relation.EQUAL_TO else float
            count = cls(count.value() if lucene.VERSION.startswith('10.') else count.value)
        self.count, self.fields = count, fields

    def select(self, *fields: str):
        """Only load selected fields."""
        self.fields = HashSet(Arrays.asList(fields))

    def __len__(self):
        return len(self.scoredocs)

    def __getitem__(self, index):
        if isinstance(index, slice):
            scoredocs = list(map(self.scoredocs.__getitem__, range(*index.indices(len(self)))))
            return type(self)(self.searcher, scoredocs, self.count, self.fields)
        scoredoc = self.scoredocs[index]
        keys = search.FieldDoc.cast_(scoredoc).fields if search.FieldDoc.instance_(scoredoc) else ()
        storedFields = self.searcher.storedFields()
        doc = storedFields.document(scoredoc.doc, *([self.fields] * (self.fields is not None)))
        return Hit(doc, scoredoc.doc, scoredoc.score, keys)

    @property
    def ids(self) -> Iterator[int]:
        return map(operator.attrgetter('doc'), self.scoredocs)

    @property
    def scores(self) -> Iterator[float]:
        return map(operator.attrgetter('score'), self.scoredocs)

    @property
    def maxscore(self) -> float:
        """max score of present hits; not necessarily of all matches"""
        return max(self.scores, default=float('nan'))

    def items(self) -> Iterator[tuple]:
        """Generate zipped ids and scores."""
        return map(operator.attrgetter('doc', 'score'), self.scoredocs)

    def highlights(self, query: search.Query, **fields: int) -> Iterator[dict]:
        """Generate highlighted fields for each hit.

        Args:
            query: lucene Query
            **fields: mapping of fields to maximum number of passages
        """
        mapping = self.searcher.highlighter.highlightFields(
            list(fields), query, list(self.ids), list(fields.values())
        )
        mapping = {field: lucene.JArray_string.cast_(mapping.get(field)) for field in fields}
        return (dict(zip(mapping, values)) for values in zip(*mapping.values()))

    def docvalues(self, field: str, type=None) -> dict:
        """Return mapping of docs to docvalues."""
        return self.searcher.docvalues(field, type).select(self.ids)

    def groupby(
        self, func: Callable, count: Optional[int] = None, docs: Optional[int] = None
    ) -> 'Groups':
        """Return ordered list of [Hits][lupyne.engine.documents.Hits] grouped by value of function applied to doc ids.

        Optionally limit the number of groups and docs per group.
        """
        groups: dict = collections.OrderedDict()
        for scoredoc in self.scoredocs:
            value = func(scoredoc.doc)
            try:
                group = groups[value]
            except KeyError:
                group = groups[value] = type(self)(self.searcher, [], fields=self.fields)
                group.value = value
            group.scoredocs.append(scoredoc)
        groups = list(groups.values())  # type: ignore
        for group in groups:
            group.count = len(group)
            group.scoredocs = group.scoredocs[:docs]
        return Groups(self.searcher, groups[:count], len(groups), self.fields)

    def filter(self, func: Callable) -> 'Hits':
        """Return [Hits][lupyne.engine.documents.Hits] filtered by function applied to doc ids."""
        scoredocs = [scoredoc for scoredoc in self.scoredocs if func(scoredoc.doc)]
        return type(self)(self.searcher, scoredocs, fields=self.fields)

    def sorted(self, key: Callable, reverse=False) -> 'Hits':
        """Return [Hits][lupyne.engine.documents.Hits] sorted by key function applied to doc ids."""
        scoredocs = sorted(self.scoredocs, key=lambda scoredoc: key(scoredoc.doc), reverse=reverse)
        return type(self)(self.searcher, scoredocs, self.count, self.fields)
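
A sketch of typical post-processing, assuming hits returned from a searcher with a stored 'title' field and an 'author' field with docvalues (names hypothetical):

hits.select('title')  # only load the title field
for hit in hits:      # the sequence protocol supports iteration
    print(hit.id, hit.score, hit['title'])
first = hits[:10]     # slicing returns another Hits instance
groups = hits.groupby(hits.docvalues('author').__getitem__, count=5)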

maxscore: float property

max score of present hits; not necessarily of all matches

docvalues(field, type=None)

Return mapping of docs to docvalues.

Source code in lupyne/engine/documents.py
def docvalues(self, field: str, type=None) -> dict:
    """Return mapping of docs to docvalues."""
    return self.searcher.docvalues(field, type).select(self.ids)

filter(func)

Return Hits filtered by function applied to doc ids.

Source code in lupyne/engine/documents.py
def filter(self, func: Callable) -> 'Hits':
    """Return [Hits][lupyne.engine.documents.Hits] filtered by function applied to doc ids."""
    scoredocs = [scoredoc for scoredoc in self.scoredocs if func(scoredoc.doc)]
    return type(self)(self.searcher, scoredocs, fields=self.fields)

groupby(func, count=None, docs=None)

Return ordered list of Hits grouped by value of function applied to doc ids.

Optionally limit the number of groups and docs per group.

Source code in lupyne/engine/documents.py
def groupby(
    self, func: Callable, count: Optional[int] = None, docs: Optional[int] = None
) -> 'Groups':
    """Return ordered list of [Hits][lupyne.engine.documents.Hits] grouped by value of function applied to doc ids.

    Optionally limit the number of groups and docs per group.
    """
    groups: dict = collections.OrderedDict()
    for scoredoc in self.scoredocs:
        value = func(scoredoc.doc)
        try:
            group = groups[value]
        except KeyError:
            group = groups[value] = type(self)(self.searcher, [], fields=self.fields)
            group.value = value
        group.scoredocs.append(scoredoc)
    groups = list(groups.values())  # type: ignore
    for group in groups:
        group.count = len(group)
        group.scoredocs = group.scoredocs[:docs]
    return Groups(self.searcher, groups[:count], len(groups), self.fields)

highlights(query, **fields)

Generate highlighted fields for each hit.

Parameters:

    query (Query): lucene Query (required)
    **fields (int): mapping of fields to maximum number of passages (default: {})
Source code in lupyne/engine/documents.py
def highlights(self, query: search.Query, **fields: int) -> Iterator[dict]:
    """Generate highlighted fields for each hit.

    Args:
        query: lucene Query
        **fields: mapping of fields to maximum number of passages
    """
    mapping = self.searcher.highlighter.highlightFields(
        list(fields), query, list(self.ids), list(fields.values())
    )
    mapping = {field: lucene.JArray_string.cast_(mapping.get(field)) for field in fields}
    return (dict(zip(mapping, values)) for values in zip(*mapping.values()))

items()

Generate zipped ids and scores.

Source code in lupyne/engine/documents.py
def items(self) -> Iterator[tuple]:
    """Generate zipped ids and scores."""
    return map(operator.attrgetter('doc', 'score'), self.scoredocs)

select(*fields)

Only load selected fields.

Source code in lupyne/engine/documents.py
def select(self, *fields: str):
    """Only load selected fields."""
    self.fields = HashSet(Arrays.asList(fields))

sorted(key, reverse=False)

Return Hits sorted by key function applied to doc ids.

Source code in lupyne/engine/documents.py
def sorted(self, key: Callable, reverse=False) -> 'Hits':
    """Return [Hits][lupyne.engine.documents.Hits] sorted by key function applied to doc ids."""
    scoredocs = sorted(self.scoredocs, key=lambda scoredoc: key(scoredoc.doc), reverse=reverse)
    return type(self)(self.searcher, scoredocs, self.count, self.fields)

lupyne.engine.documents.Groups

Sequence of grouped Hits.

Source code in lupyne/engine/documents.py
class Groups:
    """Sequence of grouped [Hits][lupyne.engine.documents.Hits]."""

    select = Hits.select

    def __init__(self, searcher, groupdocs: Sequence, count: int = 0, fields=None):
        self.searcher, self.groupdocs = searcher, groupdocs
        self.count, self.fields = count, fields

    def __len__(self):
        return len(self.groupdocs)

    def __getitem__(self, index):
        hits = groupdocs = self.groupdocs[index]
        if isinstance(groupdocs, grouping.GroupDocs):
            if lucene.VERSION.startswith('10.'):  # pragma: no cover
                hits = Hits(self.searcher, groupdocs.scoreDocs(), groupdocs.totalHits())
                hits.value = convert(groupdocs.groupValue())
            else:
                hits = Hits(self.searcher, groupdocs.scoreDocs, groupdocs.totalHits)
                hits.value = convert(groupdocs.groupValue)
        hits.fields = self.fields
        return hits

    @property
    def facets(self):
        """mapping of field values and counts"""
        return {hits.value: hits.count for hits in self}

facets property

mapping of field values and counts

lupyne.engine.documents.GroupingSearch

Bases: GroupingSearch

Inherited lucene GroupingSearch with optimized faceting.

Parameters:

    field (str): unique field name to group by (required)
    sort: lucene Sort to order groups and docs (default: None)
    cache: use unlimited caching (default: True)
    **attrs: additional attributes to set (default: {})
Source code in lupyne/engine/documents.py
class GroupingSearch(grouping.GroupingSearch):
    """Inherited lucene GroupingSearch with optimized faceting.

    Args:
        field: unique field name to group by
        sort: lucene Sort to order groups and docs
        cache: use unlimited caching
        **attrs: additional attributes to set
    """

    def __init__(self, field: str, sort=None, cache=True, **attrs):
        super().__init__(field)
        self.field = field
        if sort:
            self.groupSort = self.sortWithinGroup = sort
            self.fillSortFields = True
        if cache:
            self.setCachingInMB(float('inf'), True)
        for name in attrs:
            getattr(type(self), name).__set__(self, attrs[name])

    def __len__(self):
        return self.allMatchingGroups.size()

    def __iter__(self):
        return map(convert, self.allMatchingGroups)

    def search(
        self, searcher, query: search.Query, count: Optional[int] = None, start: int = 0
    ) -> Groups:
        """Run query and return [Groups][lupyne.engine.documents.Groups]."""
        if count is None:
            count = sum(
                index.DocValues.getSorted(reader, self.field).valueCount
                for reader in searcher.readers
            )
        topgroups = super().search(searcher, query, start, max(count - start, 1))
        return Groups(searcher, topgroups.groups, topgroups.totalHitCount)

search(searcher, query, count=None, start=0)

Run query and return Groups.

Source code in lupyne/engine/documents.py
def search(
    self, searcher, query: search.Query, count: Optional[int] = None, start: int = 0
) -> Groups:
    """Run query and return [Groups][lupyne.engine.documents.Groups]."""
    if count is None:
        count = sum(
            index.DocValues.getSorted(reader, self.field).valueCount
            for reader in searcher.readers
        )
    topgroups = super().search(searcher, query, start, max(count - start, 1))
    return Groups(searcher, topgroups.groups, topgroups.totalHitCount)
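
A sketch of grouped searching and faceting, assuming a 'color' field indexed with sorted docvalues and a searcher created elsewhere:

from lupyne import engine

gsearch = engine.documents.GroupingSearch('color')
groups = gsearch.search(searcher, engine.Query.alldocs())
groups.facets        # mapping of group values to counts
for hits in groups:  # each group is a Hits instance with a value attribute
    print(hits.value, hits.count)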

lupyne.engine.documents.Field

Bases: FieldType

Saved parameters which can generate lucene Fields given values.

Parameters:

    name (str): name of field (required)
Source code in lupyne/engine/documents.py
class Field(FieldType):  # type: ignore
    """Saved parameters which can generate lucene Fields given values.

    Args:
        name: name of field
    """

    docValuesType = property(FieldType.docValuesType, FieldType.setDocValuesType)
    indexOptions = property(FieldType.indexOptions, FieldType.setIndexOptions)
    omitNorms = property(FieldType.omitNorms, FieldType.setOmitNorms)
    stored = property(FieldType.stored, FieldType.setStored)
    storeTermVectorOffsets = property(
        FieldType.storeTermVectorOffsets, FieldType.setStoreTermVectorOffsets
    )
    storeTermVectorPayloads = property(
        FieldType.storeTermVectorPayloads, FieldType.setStoreTermVectorPayloads
    )
    storeTermVectorPositions = property(
        FieldType.storeTermVectorPositions, FieldType.setStoreTermVectorPositions
    )
    storeTermVectors = property(FieldType.storeTermVectors, FieldType.setStoreTermVectors)
    tokenized = property(FieldType.tokenized, FieldType.setTokenized)

    properties = {name for name in locals() if not name.startswith('__')}
    types = {int: 'long', float: 'double', str: 'string'}
    types.update(
        NUMERIC='long', BINARY='string', SORTED='string', SORTED_NUMERIC='long', SORTED_SET='string'
    )
    dimensions = property(
        FieldType.pointDimensionCount,
        lambda self, count: self.setDimensions(count, Long.BYTES),
    )

    def __init__(self, name: str, docValuesType='', indexOptions='', dimensions=0, **settings):
        super().__init__()
        self.name = name
        for name in self.properties.intersection(settings):
            setattr(self, name, settings.pop(name))
        for name in settings:
            raise AttributeError(f"'Field' object has no property '{name}'")
        if dimensions:
            self.dimensions = dimensions
        if indexOptions:
            self.indexOptions = getattr(index.IndexOptions, indexOptions.upper())
        if docValuesType:
            self.docValuesType = getattr(index.DocValuesType, docValuesType.upper())
            name = docValuesType.title().replace('_', '')
            self.docValueClass = getattr(document, name + 'DocValuesField')
            if self.stored or self.indexed or self.dimensions:
                settings = self.settings
                del settings['docValuesType']
                self.docValueLess = Field(self.name, **settings)
        assert self.stored or self.indexed or self.docvalues or self.dimensions

    @classmethod
    def String(
        cls, name: str, tokenized=False, omitNorms=True, indexOptions='DOCS', **settings
    ) -> 'Field':
        """Return Field with default settings for strings."""
        settings.update(tokenized=tokenized, omitNorms=omitNorms, indexOptions=indexOptions)
        return cls(name, **settings)

    @classmethod
    def Text(cls, name: str, indexOptions='DOCS_AND_FREQS_AND_POSITIONS', **settings) -> 'Field':
        """Return Field with default settings for text."""
        return cls(name, indexOptions=indexOptions, **settings)

    @property
    def indexed(self):
        return self.indexOptions != index.IndexOptions.NONE

    @property
    def docvalues(self):
        return self.docValuesType != index.DocValuesType.NONE

    @property
    def settings(self) -> dict:
        """dict representation of settings"""
        defaults = FieldType()
        result = {'dimensions': self.dimensions} if self.dimensions else {}
        for name in Field.properties:
            value = getattr(self, name)
            if value != getattr(defaults, name)():
                result[name] = value if isinstance(value, int) else str(value)
        return result

    def items(self, *values) -> Iterator[document.Field]:
        """Generate lucene Fields suitable for adding to a document."""
        if self.docvalues:
            types = {int: int, float: util.NumericUtils.doubleToSortableLong}
            for value in values:
                yield self.docValueClass(self.name, types.get(type(value), util.BytesRef)(value))
            self = getattr(self, 'docValueLess', self)  # type: ignore
        if self.dimensions:
            for value in values:
                cls = document.LongPoint if isinstance(value, int) else document.DoublePoint
                yield cls(self.name, value)
        if self.indexed:
            for value in values:
                yield document.Field(self.name, value, self)
        elif self.stored:
            for value in values:
                yield document.StoredField(self.name, value)

settings: dict property

dict representation of settings

String(name, tokenized=False, omitNorms=True, indexOptions='DOCS', **settings) classmethod

Return Field with default settings for strings.

Source code in lupyne/engine/documents.py
@classmethod
def String(
    cls, name: str, tokenized=False, omitNorms=True, indexOptions='DOCS', **settings
) -> 'Field':
    """Return Field with default settings for strings."""
    settings.update(tokenized=tokenized, omitNorms=omitNorms, indexOptions=indexOptions)
    return cls(name, **settings)

Text(name, indexOptions='DOCS_AND_FREQS_AND_POSITIONS', **settings) classmethod

Return Field with default settings for text.

Source code in lupyne/engine/documents.py
@classmethod
def Text(cls, name: str, indexOptions='DOCS_AND_FREQS_AND_POSITIONS', **settings) -> 'Field':
    """Return Field with default settings for text."""
    return cls(name, indexOptions=indexOptions, **settings)

items(*values)

Generate lucene Fields suitable for adding to a document.

Source code in lupyne/engine/documents.py
def items(self, *values) -> Iterator[document.Field]:
    """Generate lucene Fields suitable for adding to a document."""
    if self.docvalues:
        types = {int: int, float: util.NumericUtils.doubleToSortableLong}
        for value in values:
            yield self.docValueClass(self.name, types.get(type(value), util.BytesRef)(value))
        self = getattr(self, 'docValueLess', self)  # type: ignore
    if self.dimensions:
        for value in values:
            cls = document.LongPoint if isinstance(value, int) else document.DoublePoint
            yield cls(self.name, value)
    if self.indexed:
        for value in values:
            yield document.Field(self.name, value, self)
    elif self.stored:
        for value in values:
            yield document.StoredField(self.name, value)
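
A sketch of common configurations through the writer's set method; all field names are hypothetical:

indexer.set('name', engine.Field.String, stored=True)  # untokenized, indexed string
indexer.set('text', engine.Field.Text)                 # tokenized full-text field
indexer.set('size', dimensions=1)                      # one-dimensional point field
indexer.set('sorter', docValuesType='sorted')          # docvalues for sorting or grouping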

lupyne.engine.documents.NestedField

Bases: Field

Field which indexes every component into its own field.

Original value may be stored for convenience.

Parameters:

    sep (str): field separator used on name and values (default: '.')
Source code in lupyne/engine/documents.py
class NestedField(Field):
    """Field which indexes every component into its own field.

    Original value may be stored for convenience.

    Args:
        sep: field separator used on name and values
    """

    def __init__(self, name: str, sep: str = '.', **settings):
        super().__init__(name, **Field.String(name, **settings).settings)
        self.sep = sep
        self.names = tuple(self.values(name))

    def values(self, value: str) -> Iterator[str]:
        """Generate component field values in order."""
        values = value.split(self.sep)
        for stop in range(1, len(values) + 1):
            yield self.sep.join(values[:stop])

    def items(self, *values: str) -> Iterator[document.Field]:
        """Generate indexed component fields."""
        field = getattr(self, 'docValueLess', self)
        for value in values:
            for name, text in zip(self.names, self.values(value)):
                yield document.Field(name, text, field)
                if self.docvalues:
                    yield self.docValueClass(name, util.BytesRef(text))

    def prefix(self, value: str) -> Query:
        """Return prefix query of the closest possible prefixed field."""
        index = value.count(self.sep)
        return Query.prefix(self.names[index], value)

    def range(self, start, stop, lower=True, upper=False) -> Query:
        """Return range query of the closest possible prefixed field."""
        index = max(value.count(self.sep) for value in (start, stop) if value is not None)
        return Query.range(self.names[index], start, stop, lower, upper)

items(*values)

Generate indexed component fields.

Source code in lupyne/engine/documents.py
def items(self, *values: str) -> Iterator[document.Field]:
    """Generate indexed component fields."""
    field = getattr(self, 'docValueLess', self)
    for value in values:
        for name, text in zip(self.names, self.values(value)):
            yield document.Field(name, text, field)
            if self.docvalues:
                yield self.docValueClass(name, util.BytesRef(text))

prefix(value)

Return prefix query of the closest possible prefixed field.

Source code in lupyne/engine/documents.py
def prefix(self, value: str) -> Query:
    """Return prefix query of the closest possible prefixed field."""
    index = value.count(self.sep)
    return Query.prefix(self.names[index], value)

range(start, stop, lower=True, upper=False)

Return range query of the closest possible prefixed field.

Source code in lupyne/engine/documents.py
def range(self, start, stop, lower=True, upper=False) -> Query:
    """Return range query of the closest possible prefixed field."""
    index = max(value.count(self.sep) for value in (start, stop) if value is not None)
    return Query.range(self.names[index], start, stop, lower, upper)

values(value)

Generate component field values in order.

Source code in lupyne/engine/documents.py
def values(self, value: str) -> Iterator[str]:
    """Generate component field values in order."""
    values = value.split(self.sep)
    for stop in range(1, len(values) + 1):
        yield self.sep.join(values[:stop])
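
A sketch of component expansion and querying, assuming a path-like field registered with a '/' separator:

field = indexer.set('path', engine.NestedField, sep='/')
list(field.values('usr/local/bin'))  # ['usr', 'usr/local', 'usr/local/bin']
field.prefix('usr/local')            # prefix query on the closest component field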

lupyne.engine.documents.DateTimeField

Bases: Field

Field which indexes datetimes as Point fields of timestamps.

Supports datetimes, dates, and any prefix of time tuples.

Source code in lupyne/engine/documents.py
class DateTimeField(Field):
    """Field which indexes datetimes as Point fields of timestamps.

    Supports datetimes, dates, and any prefix of time tuples.
    """

    def __init__(self, name: str, dimensions: int = 1, **settings):
        super().__init__(name, dimensions=dimensions, **settings)

    @classmethod
    def timestamp(cls, date) -> float:
        """Return utc timestamp from date or time tuple."""
        if isinstance(date, datetime.date):
            return calendar.timegm(date.timetuple()) + getattr(date, 'microsecond', 0) * 1e-6
        return float(calendar.timegm(tuple(date) + (None, 1, 1, 0, 0, 0)[len(date) :]))

    def items(self, *dates) -> Iterator[document.Field]:
        """Generate lucene NumericFields of timestamps."""
        return super().items(*map(self.timestamp, dates))

    def range(self, start, stop, **inclusive) -> Query:
        """Return NumericRangeQuery of timestamps."""
        interval = (date and self.timestamp(date) for date in (start, stop))
        return Query.ranges(self.name, interval, **inclusive)

    def prefix(self, date) -> Query:
        """Return range query which matches the date prefix."""
        if isinstance(date, datetime.date):
            date = date.timetuple()[: 6 if isinstance(date, datetime.datetime) else 3]
        if len(date) == 2 and date[1] == 12:  # month must be valid
            return self.range(date, (date[0] + 1, 1))
        return self.range(date, tuple(date[:-1]) + (date[-1] + 1,))

    def duration(self, date, days=0, **delta) -> Query:
        """Return date range query within time span of date.

        Args:
            date: origin date or tuple
            days: number of days in the timedelta
            **delta: additional timedelta parameters
        """
        if not isinstance(date, datetime.date):
            date = datetime.datetime(*(tuple(date) + (None, 1, 1)[len(date) :]))
        delta = datetime.timedelta(days, **delta)  # type: ignore
        return self.range(*sorted([date, date + delta]), upper=True)

    def within(self, days=0, weeks=0, tz=None, **delta) -> Query:
        """Return date range query within current time and delta.

        If the delta is an exact number of days, then dates will be used.

        Args:
            days: number of days to offset from today
            weeks: number of weeks to offset from today
            tz: optional timezone
            **delta: additional timedelta parameters
        """
        date = datetime.datetime.now(tz)
        if not (isinstance(days + weeks, float) or delta):
            date = date.date()  # type: ignore
        return self.duration(date, days, weeks=weeks, **delta)

duration(date, days=0, **delta)

Return date range query within time span of date.

Parameters:

    date: origin date or tuple (required)
    days: number of days in the timedelta (default: 0)
    **delta: additional timedelta parameters (default: {})
Source code in lupyne/engine/documents.py
def duration(self, date, days=0, **delta) -> Query:
    """Return date range query within time span of date.

    Args:
        date: origin date or tuple
        days: number of days in the timedelta
        **delta: additional timedelta parameters
    """
    if not isinstance(date, datetime.date):
        date = datetime.datetime(*(tuple(date) + (None, 1, 1)[len(date) :]))
    delta = datetime.timedelta(days, **delta)  # type: ignore
    return self.range(*sorted([date, date + delta]), upper=True)

items(*dates)

Generate lucene NumericFields of timestamps.

Source code in lupyne/engine/documents.py
def items(self, *dates) -> Iterator[document.Field]:
    """Generate lucene NumericFields of timestamps."""
    return super().items(*map(self.timestamp, dates))

prefix(date)

Return range query which matches the date prefix.

Source code in lupyne/engine/documents.py
def prefix(self, date) -> Query:
    """Return range query which matches the date prefix."""
    if isinstance(date, datetime.date):
        date = date.timetuple()[: 6 if isinstance(date, datetime.datetime) else 3]
    if len(date) == 2 and date[1] == 12:  # month must be valid
        return self.range(date, (date[0] + 1, 1))
    return self.range(date, tuple(date[:-1]) + (date[-1] + 1,))

range(start, stop, **inclusive)

Return NumericRangeQuery of timestamps.

Source code in lupyne/engine/documents.py
def range(self, start, stop, **inclusive) -> Query:
    """Return NumericRangeQuery of timestamps."""
    interval = (date and self.timestamp(date) for date in (start, stop))
    return Query.ranges(self.name, interval, **inclusive)

timestamp(date) classmethod

Return utc timestamp from date or time tuple.

Source code in lupyne/engine/documents.py
@classmethod
def timestamp(cls, date) -> float:
    """Return utc timestamp from date or time tuple."""
    if isinstance(date, datetime.date):
        return calendar.timegm(date.timetuple()) + getattr(date, 'microsecond', 0) * 1e-6
    return float(calendar.timegm(tuple(date) + (None, 1, 1, 0, 0, 0)[len(date) :]))

within(days=0, weeks=0, tz=None, **delta)

Return date range query within current time and delta.

If the delta is an exact number of days, then dates will be used.

Parameters:

    days: number of days to offset from today (default: 0)
    weeks: number of weeks to offset from today (default: 0)
    tz: optional timezone (default: None)
    **delta: additional timedelta parameters (default: {})
Source code in lupyne/engine/documents.py
def within(self, days=0, weeks=0, tz=None, **delta) -> Query:
    """Return date range query within current time and delta.

    If the delta is an exact number of days, then dates will be used.

    Args:
        days: number of days to offset from today
        weeks: number of weeks to offset from today
        tz: optional timezone
        **delta: additional timedelta parameters
    """
    date = datetime.datetime.now(tz)
    if not (isinstance(days + weeks, float) or delta):
        date = date.date()  # type: ignore
    return self.duration(date, days, weeks=weeks, **delta)
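
A sketch of date indexing and queries, assuming a registered field named 'date' and an indexer created elsewhere:

import datetime

field = indexer.set('date', engine.DateTimeField, stored=True)
indexer.add(date=datetime.date(2024, 2, 29))
field.prefix((2024, 2))  # matches any date in February 2024
field.within(days=7)     # matches dates within the coming week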

lupyne.engine.documents.ShapeField

Field which indexes geometries: LatLon or XY.

Source code in lupyne/engine/documents.py
class ShapeField:
    """Field which indexes geometries: LatLon or XY."""

    def __init__(self, name: str, indexed=True, docvalues=False):
        self.name, self.indexed, self.docvalues = name, bool(indexed), bool(docvalues)

    def apply(self, func: Callable, shape: geo.Geometry):
        if isinstance(shape, geo.Point):
            return func(self.name, shape.lat, shape.lon)
        if isinstance(shape, geo.XYPoint):
            return func(self.name, shape.x, shape.y)
        return func(self.name, shape)

    def items(self, *shapes: geo.Geometry) -> Iterator[document.Field]:
        """Generate lucene shape fields from geometries."""
        for shape in shapes:
            cls = document.XYShape if isinstance(shape, geo.XYGeometry) else document.LatLonShape
            if self.indexed:
                yield from self.apply(cls.createIndexableFields, shape)
            if self.docvalues:
                yield self.apply(cls.createDocValueField, shape)

    def distances(self, point: Union[geo.Point, geo.XYPoint]) -> search.SortField:
        """Return distance SortField."""
        xy = isinstance(point, geo.XYGeometry)
        cls = document.XYDocValuesField if xy else document.LatLonDocValuesField
        return self.apply(cls.newDistanceSort, point)

    def query(self, relation: QueryRelation, *shapes: geo.Geometry) -> search.Query:  # type: ignore
        shape = shapes[0]
        cls = document.XYShape if isinstance(shape, geo.XYGeometry) else document.LatLonShape
        func = cls.newGeometryQuery
        if isinstance(shape, (geo.Line, geo.XYLine)):
            func = cls.newLineQuery
        if isinstance(shape, (geo.Circle, geo.XYCircle)):
            func = cls.newDistanceQuery
        if isinstance(shape, (geo.Polygon, geo.XYPolygon)):
            func = cls.newPolygonQuery
        return func(self.name, relation, *shapes)

    def contains(self, *shapes: geo.Geometry) -> search.Query:
        """Return shape query with `contains` relation."""
        return self.query(QueryRelation.CONTAINS, *shapes)

    def disjoint(self, *shapes: geo.Geometry) -> search.Query:
        """Return shape query with `disjoint` relation."""
        return self.query(QueryRelation.DISJOINT, *shapes)

    def intersects(self, *shapes: geo.Geometry) -> search.Query:
        """Return shape query with `intersects` relation."""
        return self.query(QueryRelation.INTERSECTS, *shapes)

    def within(self, *shapes: geo.Geometry) -> search.Query:
        """Return shape query with `within` relation."""
        return self.query(QueryRelation.WITHIN, *shapes)

contains(*shapes)

Return shape query with contains relation.

Source code in lupyne/engine/documents.py
def contains(self, *shapes: geo.Geometry) -> search.Query:
    """Return shape query with `contains` relation."""
    return self.query(QueryRelation.CONTAINS, *shapes)

disjoint(*shapes)

Return shape query with disjoint relation.

Source code in lupyne/engine/documents.py
def disjoint(self, *shapes: geo.Geometry) -> search.Query:
    """Return shape query with `disjoint` relation."""
    return self.query(QueryRelation.DISJOINT, *shapes)

distances(point)

Return distance SortField.

Source code in lupyne/engine/documents.py
def distances(self, point: Union[geo.Point, geo.XYPoint]) -> search.SortField:
    """Return distance SortField."""
    xy = isinstance(point, geo.XYGeometry)
    cls = document.XYDocValuesField if xy else document.LatLonDocValuesField
    return self.apply(cls.newDistanceSort, point)

intersects(*shapes)

Return shape query with intersects relation.

Source code in lupyne/engine/documents.py
def intersects(self, *shapes: geo.Geometry) -> search.Query:
    """Return shape query with `intersects` relation."""
    return self.query(QueryRelation.INTERSECTS, *shapes)

items(*shapes)

Generate lucene shape fields from geometries.

Source code in lupyne/engine/documents.py
def items(self, *shapes: geo.Geometry) -> Iterator[document.Field]:
    """Generate lucene shape fields from geometries."""
    for shape in shapes:
        cls = document.XYShape if isinstance(shape, geo.XYGeometry) else document.LatLonShape
        if self.indexed:
            yield from self.apply(cls.createIndexableFields, shape)
        if self.docvalues:
            yield self.apply(cls.createDocValueField, shape)

within(*shapes)

Return shape query with within relation.

Source code in lupyne/engine/documents.py
def within(self, *shapes: geo.Geometry) -> search.Query:
    """Return shape query with `within` relation."""
    return self.query(QueryRelation.WITHIN, *shapes)
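
A sketch of geometry indexing and querying; it assumes pylucene exposes the org.apache.lucene.geo classes under the import shown:

from org.apache.lucene import geo
from lupyne.engine.documents import ShapeField

field = ShapeField('loc', docvalues=True)
point = geo.Point(38.9, -77.0)     # lat, lon
fields = list(field.items(point))  # lucene shape fields for a document
sort = field.distances(point)      # distance SortField
query = field.intersects(point)    # shape query with `intersects` relation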

queries

lupyne.engine.queries.Query

Inherited lucene Query, with dynamic base class acquisition.

Uses class methods and operator overloading for convenient query construction.

Source code in lupyne/engine/queries.py
class Query:
    """Inherited lucene Query, with dynamic base class acquisition.

    Uses class methods and operator overloading for convenient query construction.
    """

    def __new__(cls, base, *args):
        return base.__new__(type(base.__name__, (cls, base), {}))

    def __init__(self, base: search.Query, *args):
        base.__init__(self, *args)

    @classmethod
    def term(cls, name: str, value) -> 'Query':
        """Return lucene TermQuery."""
        return cls(search.TermQuery, index.Term(name, value))

    @classmethod
    def terms(cls, name: str, values) -> 'Query':
        """Return lucene TermInSetQuery, optimizing a SHOULD BooleanQuery of many terms."""
        return cls(search.TermInSetQuery, name, Arrays.asList(list(map(util.BytesRef, values))))

    @classmethod
    def boolean(cls, occur, *queries, **terms):
        builder = search.BooleanQuery.Builder()
        for query in queries:
            builder.add(query, occur)
        for name, values in terms.items():
            for value in [values] if isinstance(values, str) else values:
                builder.add(cls.term(name, value), occur)
        return builder.build()

    @classmethod
    def any(cls, *queries: search.Query, **terms) -> search.BooleanQuery:
        """Return lucene BooleanQuery with SHOULD clauses from queries and terms."""
        return cls.boolean(search.BooleanClause.Occur.SHOULD, *queries, **terms)

    @classmethod
    def all(cls, *queries: search.Query, **terms) -> search.BooleanQuery:
        """Return lucene BooleanQuery with MUST clauses from queries and terms."""
        return cls.boolean(search.BooleanClause.Occur.MUST, *queries, **terms)

    @classmethod
    def filter(cls, *queries: search.Query, **terms) -> search.BooleanQuery:
        """Return lucene BooleanQuery with FILTER clauses from queries and terms."""
        return cls.boolean(search.BooleanClause.Occur.FILTER, *queries, **terms)

    @classmethod
    def disjunct(cls, multiplier, *queries, **terms):
        """Return lucene DisjunctionMaxQuery from queries and terms."""
        queries = list(queries)
        for name, values in terms.items():
            if isinstance(values, str):
                values = [values]
            queries += (cls.term(name, value) for value in values)
        return cls(search.DisjunctionMaxQuery, Arrays.asList(queries), multiplier)

    @classmethod
    def span(cls, *term) -> 'SpanQuery':
        """Return [SpanQuery][lupyne.engine.queries.SpanQuery] from term name and value or a MultiTermQuery."""
        if len(term) <= 1:
            return SpanQuery(spans.SpanMultiTermQueryWrapper, *term)
        return SpanQuery(spans.SpanTermQuery, index.Term(*term))

    @classmethod
    def near(cls, name: str, *values, **kwargs) -> 'SpanQuery':
        """Return [SpanNearQuery][lupyne.engine.queries.SpanQuery.near] from terms.
        Term values which supply another field name will be masked."""
        spans = (
            cls.span(name, value) if isinstance(value, str) else cls.span(*value).mask(name)
            for value in values
        )
        return SpanQuery.near(*spans, **kwargs)

    @classmethod
    def prefix(cls, name: str, value) -> 'Query':
        """Return lucene PrefixQuery."""
        return cls(search.PrefixQuery, index.Term(name, value))

    @classmethod
    def range(cls, name: str, start, stop, lower=True, upper=False) -> 'Query':
        """Return lucene RangeQuery, by default with a half-open interval."""
        start, stop = (value if value is None else util.BytesRef(value) for value in (start, stop))
        return cls(search.TermRangeQuery, name, start, stop, lower, upper)

    @classmethod
    def phrase(cls, name: str, *values, **attrs) -> search.MultiPhraseQuery:
        """Return lucene MultiPhraseQuery.  None may be used as a placeholder."""
        builder = search.MultiPhraseQuery.Builder()
        for attr in attrs:
            setattr(builder, attr, attrs[attr])
        for idx, words in enumerate(values):
            if isinstance(words, str):
                words = [words]
            if words is not None:
                builder.add([index.Term(name, word) for word in words], idx)
        return builder.build()

    @classmethod
    def wildcard(cls, name: str, value) -> 'Query':
        """Return lucene WildcardQuery."""
        return cls(search.WildcardQuery, index.Term(name, value))

    @classmethod
    def fuzzy(cls, name: str, value, *args) -> 'Query':
        """Return lucene FuzzyQuery."""
        return cls(search.FuzzyQuery, index.Term(name, value), *args)

    @classmethod
    def alldocs(cls) -> 'Query':
        """Return lucene MatchAllDocsQuery."""
        return cls(search.MatchAllDocsQuery)

    @classmethod
    def nodocs(cls) -> 'Query':
        """Return lucene MatchNoDocsQuery."""
        return cls(search.MatchNoDocsQuery)

    @classmethod
    def regexp(cls, name: str, value, *args) -> 'Query':
        """Return lucene RegexpQuery."""
        return cls(search.RegexpQuery, index.Term(name, value), *args)

    @staticmethod
    def points(name: str, *values) -> search.Query:
        """Return lucene set query of one dimensional points."""
        if any(isinstance(value, float) for value in values):
            return document.DoublePoint.newSetQuery(name, values)
        return document.LongPoint.newSetQuery(name, tuple(map(int, values)))

    @staticmethod
    def ranges(name: str, *intervals, lower=True, upper=False) -> search.Query:
        """Return lucene multidimensional point range query, by default with half-open intervals."""
        starts, stops = [], []
        for start, stop in intervals:
            if isinstance(start, float) or isinstance(stop, float):
                if start is None:
                    start = Double.NEGATIVE_INFINITY
                elif not lower:
                    start = document.DoublePoint.nextUp(start)
                if stop is None:
                    stop = Double.POSITIVE_INFINITY
                elif not upper:
                    stop = document.DoublePoint.nextDown(stop)
            else:
                if start is None:
                    start = Long.MIN_VALUE
                elif not lower:
                    start += 1
                if stop is None:
                    stop = Long.MAX_VALUE
                elif not upper:
                    stop -= 1
            starts.append(start)
            stops.append(stop)
        if any(isinstance(value, float) for value in starts):
            return document.DoublePoint.newRangeQuery(name, starts, stops)
        return document.LongPoint.newRangeQuery(name, starts, stops)

    def constant(self) -> 'Query':
        """Return lucene ConstantScoreQuery."""
        return Query(search.ConstantScoreQuery, self)

    def boost(self, value: float) -> 'Query':
        """Return lucene BoostQuery."""
        return Query(search.BoostQuery, self, value)

    def __pos__(self) -> search.BooleanQuery:
        """+self"""
        return Query.all(self)

    def __neg__(self) -> search.BooleanQuery:
        """-self"""
        return Query.boolean(search.BooleanClause.Occur.MUST_NOT, self)

    def __and__(self, other: search.Query) -> search.BooleanQuery:
        """+self +other"""
        return Query.all(self, other)

    def __rand__(self, other):
        return Query.all(other, self)

    def __or__(self, other: search.Query) -> search.BooleanQuery:
        """self other"""
        return Query.any(self, other)

    def __ror__(self, other):
        return Query.any(other, self)

    def __sub__(self, other: search.Query) -> search.BooleanQuery:
        """self -other"""
        builder = search.BooleanQuery.Builder()
        builder.add(self, search.BooleanClause.Occur.SHOULD)
        builder.add(other, search.BooleanClause.Occur.MUST_NOT)
        return builder.build()

    def __rsub__(self, other):
        return Query.__sub__(other, self)
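
A sketch of composing queries with the class methods and operators; the field names are hypothetical:

from lupyne import engine

q = engine.Query.term('text', 'hello') | engine.Query.prefix('text', 'wor')  # SHOULD clauses
q = engine.Query.term('text', 'hello') - engine.Query.term('text', 'spam')   # hello but not spam
q = engine.Query.ranges('size', (0, 10))            # half-open one-dimensional point range
q = engine.Query.term('text', 'hello').boost(2.0)   # wrapped in a BoostQuery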

__and__(other)

+self +other

Source code in lupyne/engine/queries.py
def __and__(self, other: search.Query) -> search.BooleanQuery:
    """+self +other"""
    return Query.all(self, other)

__neg__()

-self

Source code in lupyne/engine/queries.py
def __neg__(self) -> search.BooleanQuery:
    """-self"""
    return Query.boolean(search.BooleanClause.Occur.MUST_NOT, self)

__or__(other)

self other

Source code in lupyne/engine/queries.py
def __or__(self, other: search.Query) -> search.BooleanQuery:
    """self other"""
    return Query.any(self, other)

__pos__()

+self

Source code in lupyne/engine/queries.py
def __pos__(self) -> search.BooleanQuery:
    """+self"""
    return Query.all(self)

__sub__(other)

self -other

Source code in lupyne/engine/queries.py
def __sub__(self, other: search.Query) -> search.BooleanQuery:
    """self -other"""
    builder = search.BooleanQuery.Builder()
    builder.add(self, search.BooleanClause.Occur.SHOULD)
    builder.add(other, search.BooleanClause.Occur.MUST_NOT)
    return builder.build()

all(*queries, **terms) classmethod

Return lucene BooleanQuery with MUST clauses from queries and terms.

Source code in lupyne/engine/queries.py
@classmethod
def all(cls, *queries: search.Query, **terms) -> search.BooleanQuery:
    """Return lucene BooleanQuery with MUST clauses from queries and terms."""
    return cls.boolean(search.BooleanClause.Occur.MUST, *queries, **terms)
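
Keyword arguments expand to TermQuery clauses, so queries and terms mix freely. A sketch with hypothetical fields, reusing the Q alias from above:

q = Q.all(Q.prefix('title', 'luc'), text='hello')  # +title:luc* +text:hello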

alldocs() classmethod

Return lucene MatchAllDocsQuery.

Source code in lupyne/engine/queries.py
@classmethod
def alldocs(cls) -> 'Query':
    """Return lucene MatchAllDocsQuery."""
    return cls(search.MatchAllDocsQuery)

any(*queries, **terms) classmethod

Return lucene BooleanQuery with SHOULD clauses from queries and terms.

Source code in lupyne/engine/queries.py
@classmethod
def any(cls, *queries: search.Query, **terms) -> search.BooleanQuery:
    """Return lucene BooleanQuery with SHOULD clauses from queries and terms."""
    return cls.boolean(search.BooleanClause.Occur.SHOULD, *queries, **terms)

boost(value)

Return lucene BoostQuery.

Source code in lupyne/engine/queries.py
def boost(self, value: float) -> 'Query':
    """Return lucene BoostQuery."""
    return Query(search.BoostQuery, self, value)

constant()

Return lucene ConstantScoreQuery.

Source code in lupyne/engine/queries.py
def constant(self) -> 'Query':
    """Return lucene ConstantScoreQuery."""
    return Query(search.ConstantScoreQuery, self)

disjunct(multiplier, *queries, **terms) classmethod

Return lucene DisjunctionMaxQuery from queries and terms.

Source code in lupyne/engine/queries.py
@classmethod
def disjunct(cls, multiplier, *queries, **terms):
    """Return lucene DisjunctionMaxQuery from queries and terms."""
    queries = list(queries)
    for name, values in terms.items():
        if isinstance(values, str):
            values = [values]
        queries += (cls.term(name, value) for value in values)
    return cls(search.DisjunctionMaxQuery, Arrays.asList(queries), multiplier)
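
The multiplier is the tie-breaker: the maximal clause's score plus that fraction of the others'. A sketch with hypothetical fields:

q = Q.disjunct(0.1, Q.term('title', 'lucene'), body='lucene')
# scores as max(title:lucene, body:lucene) + 0.1 * the other clause's score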

filter(*queries, **terms) classmethod

Return lucene BooleanQuery with FILTER clauses from queries and terms.

Source code in lupyne/engine/queries.py
@classmethod
def filter(cls, *queries: search.Query, **terms) -> search.BooleanQuery:
    """Return lucene BooleanQuery with FILTER clauses from queries and terms."""
    return cls.boolean(search.BooleanClause.Occur.FILTER, *queries, **terms)
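
FILTER clauses constrain matches like MUST but contribute nothing to the score. A sketch with hypothetical fields:

q = Q.filter(Q.term('year', '2024'), status='published')
# matching but non-scoring clauses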

fuzzy(name, value, *args) classmethod

Return lucene FuzzyQuery.

Source code in lupyne/engine/queries.py
@classmethod
def fuzzy(cls, name: str, value, *args) -> 'Query':
    """Return lucene FuzzyQuery."""
    return cls(search.FuzzyQuery, index.Term(name, value), *args)
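
The extra args pass through to FuzzyQuery, e.g. the maximum edit distance. A sketch:

q = Q.fuzzy('text', 'lucen', 1)  # matches terms within 1 edit, such as 'lucene'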

near(name, *values, **kwargs) classmethod

Return SpanNearQuery from terms. Term values which supply another field name will be masked.

Source code in lupyne/engine/queries.py
@classmethod
def near(cls, name: str, *values, **kwargs) -> 'SpanQuery':
    """Return [SpanNearQuery][lupyne.engine.queries.SpanQuery.near] from terms.
    Term values which supply another field name will be masked."""
    spans = (
        cls.span(name, value) if isinstance(value, str) else cls.span(*value).mask(name)
        for value in values
    )
    return SpanQuery.near(*spans, **kwargs)
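
String values are terms in the named field; (field, value) pairs are masked so spans from different fields can combine. A sketch with hypothetical fields:

q = Q.near('text', 'quick', 'fox', slop=1)     # within 1 position, in order
q = Q.near('text', 'quick', ('title', 'fox'))  # title term masked as text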

nodocs() classmethod

Return lucene MatchNoDocsQuery.

Source code in lupyne/engine/queries.py
@classmethod
def nodocs(cls) -> 'Query':
    """Return lucene MatchNoDocsQuery."""
    return cls(search.MatchNoDocsQuery)

phrase(name, *values, **attrs) classmethod

Return lucene MultiPhraseQuery. None may be used as a placeholder.

Source code in lupyne/engine/queries.py
@classmethod
def phrase(cls, name: str, *values, **attrs) -> search.MultiPhraseQuery:
    """Return lucene MultiPhraseQuery.  None may be used as a placeholder."""
    builder = search.MultiPhraseQuery.Builder()
    for attr in attrs:
        setattr(builder, attr, attrs[attr])
    for idx, words in enumerate(values):
        if isinstance(words, str):
            words = [words]
        if words is not None:
            builder.add([index.Term(name, word) for word in words], idx)
    return builder.build()
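
None leaves a position unconstrained, a list matches any of its words at that position, and attrs set builder properties (e.g. slop, assuming the JCC property mapping). A sketch:

q = Q.phrase('text', 'quick', None, 'fox')              # any word between quick and fox
q = Q.phrase('text', 'quick', ['brown', 'red'], 'fox')  # brown or red in the middle
q = Q.phrase('text', 'quick', 'fox', slop=1)            # allow 1 position of slop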

points(name, *values) staticmethod

Return lucene set query of one dimensional points.

Source code in lupyne/engine/queries.py
@staticmethod
def points(name: str, *values) -> search.Query:
    """Return lucene set query of one dimensional points."""
    if any(isinstance(value, float) for value in values):
        return document.DoublePoint.newSetQuery(name, values)
    return document.LongPoint.newSetQuery(name, tuple(map(int, values)))
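
The point type is inferred from the values: any float selects DoublePoint, otherwise LongPoint. E.g.:

q = Q.points('size', 10, 20)     # LongPoint set query
q = Q.points('ratio', 0.5, 1.5)  # DoublePoint set query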

prefix(name, value) classmethod

Return lucene PrefixQuery.

Source code in lupyne/engine/queries.py
@classmethod
def prefix(cls, name: str, value) -> 'Query':
    """Return lucene PrefixQuery."""
    return cls(search.PrefixQuery, index.Term(name, value))

range(name, start, stop, lower=True, upper=False) classmethod

Return lucene RangeQuery, by default with a half-open interval.

Source code in lupyne/engine/queries.py
@classmethod
def range(cls, name: str, start, stop, lower=True, upper=False) -> 'Query':
    """Return lucene RangeQuery, by default with a half-open interval."""
    start, stop = (value if value is None else util.BytesRef(value) for value in (start, stop))
    return cls(search.TermRangeQuery, name, start, stop, lower, upper)
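
By default the interval is half-open, i.e. start <= term < stop, and None leaves an endpoint unbounded. A sketch with a hypothetical field:

q = Q.range('date', '2023', '2024')            # 2023 <= date < 2024
q = Q.range('date', None, '2024', upper=True)  # date <= 2024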

ranges(name, *intervals, lower=True, upper=False) staticmethod

Return lucene multidimensional point range query, by default with half-open intervals.

Source code in lupyne/engine/queries.py
@staticmethod
def ranges(name: str, *intervals, lower=True, upper=False) -> search.Query:
    """Return lucene multidimensional point range query, by default with half-open intervals."""
    starts, stops = [], []
    for start, stop in intervals:
        if isinstance(start, float) or isinstance(stop, float):
            if start is None:
                start = Double.NEGATIVE_INFINITY
            elif not lower:
                start = document.DoublePoint.nextUp(start)
            if stop is None:
                stop = Double.POSITIVE_INFINITY
            elif not upper:
                stop = document.DoublePoint.nextDown(stop)
        else:
            if start is None:
                start = Long.MIN_VALUE
            elif not lower:
                start += 1
            if stop is None:
                stop = Long.MAX_VALUE
            elif not upper:
                stop -= 1
        starts.append(start)
        stops.append(stop)
    if any(isinstance(value, float) for value in starts):
        return document.DoublePoint.newRangeQuery(name, starts, stops)
    return document.LongPoint.newRangeQuery(name, starts, stops)
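
Each (start, stop) interval supplies one dimension of the point field; None is unbounded and float values select double precision. A sketch, assuming fields indexed with matching dimensions:

q = Q.ranges('size', (None, 10))               # size < 10
q = Q.ranges('point', (0.0, 1.0), (0.0, 1.0))  # two dimensions: the unit square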

regexp(name, value, *args) classmethod

Return lucene RegexpQuery.

Source code in lupyne/engine/queries.py
@classmethod
def regexp(cls, name: str, value, *args) -> 'Query':
    """Return lucene RegexpQuery."""
    return cls(search.RegexpQuery, index.Term(name, value), *args)

span(*term) classmethod

Return SpanQuery from term name and value or a MultiTermQuery.

Source code in lupyne/engine/queries.py
@classmethod
def span(cls, *term) -> 'SpanQuery':
    """Return [SpanQuery][lupyne.engine.queries.SpanQuery] from term name and value or a MultiTermQuery."""
    if len(term) <= 1:
        return SpanQuery(spans.SpanMultiTermQueryWrapper, *term)
    return SpanQuery(spans.SpanTermQuery, index.Term(*term))
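
A (name, value) pair makes a SpanTermQuery; a single MultiTermQuery argument is wrapped instead. A sketch:

s = Q.span('text', 'hello')          # SpanTermQuery
s = Q.span(Q.prefix('text', 'hel'))  # wrapped MultiTermQuery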

term(name, value) classmethod

Return lucene TermQuery.

Source code in lupyne/engine/queries.py
@classmethod
def term(cls, name: str, value) -> 'Query':
    """Return lucene TermQuery."""
    return cls(search.TermQuery, index.Term(name, value))

terms(name, values) classmethod

Return lucene TermInSetQuery, optimizing a SHOULD BooleanQuery of many terms.

Source code in lupyne/engine/queries.py
@classmethod
def terms(cls, name: str, values) -> 'Query':
    """Return lucene TermInSetQuery, optimizing a SHOULD BooleanQuery of many terms."""
    return cls(search.TermInSetQuery, name, Arrays.asList(list(map(util.BytesRef, values))))
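
A more efficient equivalent of a SHOULD BooleanQuery over many TermQueries. E.g., with a hypothetical 'tag' field:

q = Q.terms('tag', ['red', 'green', 'blue'])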

wildcard(name, value) classmethod

Return lucene WildcardQuery.

Source code in lupyne/engine/queries.py
@classmethod
def wildcard(cls, name: str, value) -> 'Query':
    """Return lucene WildcardQuery."""
    return cls(search.WildcardQuery, index.Term(name, value))

lupyne.engine.queries.SpanQuery

Bases: Query

Inherited lucene SpanQuery with additional span constructors.

Source code in lupyne/engine/queries.py
class SpanQuery(Query):
    """Inherited lucene SpanQuery with additional span constructors."""

    def __getitem__(self, slc: slice) -> 'SpanQuery':
        start, stop, step = slc.indices(Integer.MAX_VALUE)
        assert step == 1, 'slice step is not supported'
        return SpanQuery(spans.SpanPositionRangeQuery, self, start, stop)

    def __sub__(self, other: spans.SpanQuery) -> 'SpanQuery':
        return SpanQuery(spans.SpanNotQuery, self, other)

    def __or__(*spans_: spans.SpanQuery) -> 'SpanQuery':
        return SpanQuery(spans.SpanOrQuery, spans_)

    def near(*spans_, slop=0, inOrder=True):
        """Return lucene SpanNearQuery from SpanQueries."""
        return SpanQuery(spans.SpanNearQuery, spans_, slop, inOrder)

    def mask(self, name: str) -> 'SpanQuery':
        """Return lucene FieldMaskingSpanQuery, which allows combining span queries from different fields."""
        return SpanQuery(spans.FieldMaskingSpanQuery, self, name)

    def containing(self, other: spans.SpanQuery) -> 'SpanQuery':
        """Return lucene SpanContainingQuery."""
        return SpanQuery(spans.SpanContainingQuery, self, other)

    def within(self, other: spans.SpanQuery) -> 'SpanQuery':
        """Return lucene SpanWithinQuery."""
        return SpanQuery(spans.SpanWithinQuery, self, other)
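
Spans compose through slicing (position ranges), subtraction, and proximity. A sketch with a hypothetical 'text' field, reusing the Q alias from above:

s = Q.span('text', 'quick').near(Q.span('text', 'fox'), slop=1)  # SpanNearQuery
s = Q.span('text', 'hello')[:10]                                 # first 10 positions only
s = Q.span('text', 'hello') - Q.span('text', 'world')            # SpanNotQuery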

containing(other)

Return lucene SpanContainingQuery.

Source code in lupyne/engine/queries.py
def containing(self, other: spans.SpanQuery) -> 'SpanQuery':
    """Return lucene SpanContainingQuery."""
    return SpanQuery(spans.SpanContainingQuery, self, other)

mask(name)

Return lucene FieldMaskingSpanQuery, which allows combining span queries from different fields.

Source code in lupyne/engine/queries.py
def mask(self, name: str) -> 'SpanQuery':
    """Return lucene FieldMaskingSpanQuery, which allows combining span queries from different fields."""
    return SpanQuery(spans.FieldMaskingSpanQuery, self, name)

near(*spans_, slop=0, inOrder=True)

Return lucene SpanNearQuery from SpanQueries.

Source code in lupyne/engine/queries.py
def near(*spans_, slop=0, inOrder=True):
    """Return lucene SpanNearQuery from SpanQueries."""
    return SpanQuery(spans.SpanNearQuery, spans_, slop, inOrder)

within(other)

Return lucene SpanWithinQuery.

Source code in lupyne/engine/queries.py
def within(self, other: spans.SpanQuery) -> 'SpanQuery':
    """Return lucene SpanWithinQuery."""
    return SpanQuery(spans.SpanWithinQuery, self, other)

lupyne.engine.queries.SpellParser

Bases: PythonQueryParser

Inherited lucene QueryParser which corrects spelling.

Assign a searcher attribute or override suggest implementation.

Source code in lupyne/engine/queries.py
class SpellParser(PythonQueryParser):
    """Inherited lucene QueryParser which corrects spelling.

    Assign a searcher attribute or override [suggest][lupyne.engine.queries.SpellParser.suggest] implementation.
    """

    def suggest(self, term: index.Term) -> index.Term:
        """Return term with text replaced as necessary."""
        field = term.field()
        words = self.searcher.suggest(field, term.text())
        return index.Term(field, *words) if words else term

    def rewrite(self, query: search.Query) -> search.Query:
        """Return term or phrase query with corrected terms substituted."""
        if search.TermQuery.instance_(query):
            term = search.TermQuery.cast_(query).term
            return search.TermQuery(self.suggest(term))
        query = search.PhraseQuery.cast_(query)
        builder = search.PhraseQuery.Builder()
        for position, term in zip(query.positions, query.terms):
            builder.add(self.suggest(term), position)
        return builder.build()

    def getFieldQuery_quoted(self, *args):
        return self.rewrite(self.getFieldQuery_quoted_super(*args))

    def getFieldQuery_slop(self, *args):
        return self.rewrite(self.getFieldQuery_slop_super(*args))
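
A minimal wiring sketch, assuming PythonQueryParser keeps QueryParser's (field, analyzer) constructor; the searcher stand-in here is hypothetical and only needs a suggest(field, text) method returning corrected words:

from org.apache.lucene.analysis.standard import StandardAnalyzer
from lupyne.engine.queries import SpellParser

class Searcher:  # stand-in for an engine searcher with spellchecking
    def suggest(self, field, text):
        return ['hello'] if text == 'hellp' else []

parser = SpellParser('text', StandardAnalyzer())
parser.searcher = Searcher()
q = parser.parse('hellp')  # terms corrected via suggest while parsing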

rewrite(query)

Return term or phrase query with corrected terms substituted.

Source code in lupyne/engine/queries.py
def rewrite(self, query: search.Query) -> search.Query:
    """Return term or phrase query with corrected terms substituted."""
    if search.TermQuery.instance_(query):
        term = search.TermQuery.cast_(query).term
        return search.TermQuery(self.suggest(term))
    query = search.PhraseQuery.cast_(query)
    builder = search.PhraseQuery.Builder()
    for position, term in zip(query.positions, query.terms):
        builder.add(self.suggest(term), position)
    return builder.build()

suggest(term)

Return term with text replaced as necessary.

Source code in lupyne/engine/queries.py
def suggest(self, term: index.Term) -> index.Term:
    """Return term with text replaced as necessary."""
    field = term.field()
    words = self.searcher.suggest(field, term.text())
    return index.Term(field, *words) if words else term