-
Notifications
You must be signed in to change notification settings - Fork 21
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
sort-on multiple indexes is broken#81 #82
base: master
Are you sure you want to change the base?
Changes from all commits
697719c
6eaef16
8a9608c
78fd575
bab34d7
88de79d
85611a5
5686c7a
1ff6b5f
87f0149
9141f94
a17c1f7
cedeca3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,7 @@ | |
# FOR A PARTICULAR PURPOSE | ||
# | ||
############################################################################## | ||
|
||
import heapq | ||
import logging | ||
from bisect import bisect | ||
from collections import defaultdict | ||
|
@@ -822,7 +822,7 @@ def _sort_nbest(self, actual_result_count, result, rs, | |
sort_index, sort_index_length, sort_spec, | ||
second_indexes_key_map): | ||
# Limit / sort results using N-Best algorithm | ||
# This is faster for large sets then a full sort | ||
# This is faster for large sets than a full sort | ||
# And uses far less memory | ||
index_key_map = sort_index.documentToKeyMap() | ||
keys = [] | ||
|
@@ -848,27 +848,16 @@ def _sort_nbest(self, actual_result_count, result, rs, | |
worst = keys[0] | ||
result.reverse() | ||
else: | ||
for did in rs: | ||
try: | ||
key = index_key_map[did] | ||
full_key = (key, ) | ||
for km in second_indexes_key_map: | ||
full_key += (km[did], ) | ||
except KeyError: | ||
# This document is not in the sort key index, skip it. | ||
actual_result_count -= 1 | ||
else: | ||
if n >= limit and key <= worst: | ||
continue | ||
i = bisect(keys, key) | ||
keys.insert(i, key) | ||
result.insert(i, (full_key, did, self.__getitem__)) | ||
if n == limit: | ||
del keys[0], result[0] | ||
else: | ||
n += 1 | ||
worst = keys[0] | ||
result = multisort(result, sort_spec) | ||
# we have multi index sorting | ||
result, actual_result_count = self._multi_index_nbest( | ||
actual_result_count, | ||
result, | ||
rs, | ||
limit, | ||
sort_index, | ||
sort_spec, | ||
second_indexes_key_map, | ||
reverse=True) | ||
|
||
return (actual_result_count, 0, result) | ||
|
||
|
@@ -877,11 +866,11 @@ def _sort_nbest_reverse(self, actual_result_count, result, rs, | |
sort_index, sort_index_length, sort_spec, | ||
second_indexes_key_map): | ||
# Limit / sort results using N-Best algorithm in reverse (N-Worst?) | ||
index_key_map = sort_index.documentToKeyMap() | ||
keys = [] | ||
n = 0 | ||
best = None | ||
if sort_index_length == 1: | ||
index_key_map = sort_index.documentToKeyMap() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ideally, You have refactored the multi-index sorting into a single method. I would go a step further and merge |
||
keys = [] | ||
n = 0 | ||
best = None | ||
for did in rs: | ||
try: | ||
key = index_key_map[did] | ||
|
@@ -900,30 +889,125 @@ def _sort_nbest_reverse(self, actual_result_count, result, rs, | |
n += 1 | ||
best = keys[-1] | ||
else: | ||
for did in rs: | ||
try: | ||
key = index_key_map[did] | ||
full_key = (key, ) | ||
for km in second_indexes_key_map: | ||
full_key += (km[did], ) | ||
except KeyError: | ||
# This document is not in the sort key index, skip it. | ||
actual_result_count -= 1 | ||
else: | ||
if n >= limit and key >= best: | ||
continue | ||
i = bisect(keys, key) | ||
keys.insert(i, key) | ||
result.insert(i, (full_key, did, self.__getitem__)) | ||
if n == limit: | ||
del keys[-1], result[-1] | ||
else: | ||
n += 1 | ||
best = keys[-1] | ||
result = multisort(result, sort_spec) | ||
# we have multi index sorting | ||
result, actual_result_count = self._multi_index_nbest( | ||
actual_result_count, | ||
result, | ||
rs, | ||
limit, | ||
sort_index, | ||
sort_spec, | ||
second_indexes_key_map, | ||
reverse=False | ||
) | ||
|
||
return (actual_result_count, 0, result) | ||
|
||
def _multi_index_nbest(self, actual_result_count, result, | ||
result_set, limit, sort_index, sort_spec, | ||
second_indexes_key_map, reverse=True): | ||
""" | ||
Example: | ||
Limit = 2 | ||
Sorton = (last_name, num) | ||
SortOrder = smallest first | ||
Example data: | ||
meier 3 | ||
lopez 4 | ||
meier 2 | ||
smith 1 | ||
|
||
Expected sort result: | ||
lopez 4 | ||
meier 2 | ||
|
||
Even if the first two data sets suffice for the requirement limit=2, the third data set has to be taken into account | ||
when sorting on both indexes. | ||
|
||
For multiple indexes the strategy is : | ||
1) For the first index get the index_values for all documents | ||
Result : | ||
['meier', 'lopez', 'meier', 'smith'] | ||
|
||
2) Sort the index_values using heapq to get the 'limit' largest/smallest index_values | ||
Result : | ||
['lopez', 'meier'] | ||
|
||
3) Find all documents having index_values 'lopez' or 'meier' | ||
meier 3 | ||
lopez 4 | ||
meier 2 | ||
|
||
4) Get the index_value for the other search indexes | ||
|
||
5) Sort these documents on all indexes. | ||
|
||
""" | ||
|
||
# ToDo Shortcut for limit>="all" | ||
|
||
# Step 1) | ||
# All index_values as a non unique list | ||
index_values = [] | ||
# get the mapping of document ID to index value | ||
index_key_map = sort_index.documentToKeyMap() | ||
|
||
# get index values for all document IDs (did) of the result set | ||
for did in result_set: | ||
# get the index value of the current document id | ||
try: | ||
index_value = index_key_map[did] | ||
except KeyError: | ||
# This document is not in the sort key index, skip it. | ||
# ToDo: Is this the correct/intended behavior??? | ||
actual_result_count -= 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As |
||
else: | ||
# We found a valid document | ||
index_values.append(index_value) | ||
|
||
# Step 2) | ||
# Get the 'limit' smallest/largest of index_values | ||
if reverse: | ||
limited_index_values = heapq.nlargest(limit, index_values) | ||
else: | ||
limited_index_values = heapq.nsmallest(limit, index_values) | ||
# we have to reverse the results since the following code expects | ||
# the index_values in descending order | ||
limited_index_values.reverse() | ||
|
||
# Step 3) | ||
|
||
# get the first index_value | ||
last_index_value = limited_index_values[0] | ||
# store all belonging documents | ||
all_documents_for_sorting = list(sort_index._index[last_index_value]) | ||
# for all other index values | ||
for idx, current_index_value in enumerate(limited_index_values[1:]): | ||
# Duplicate checking | ||
if last_index_value != current_index_value: | ||
# We have a fresh index_value | ||
# store all belonging documents | ||
all_documents_for_sorting += list(sort_index._index[current_index_value]) | ||
# remember the index_value for duplicate checking | ||
last_index_value = current_index_value | ||
|
||
# Step 4) Get the index_values for the other search indexes | ||
# The sort_set includes the list of index_values per document | ||
sort_set = [] | ||
for did in all_documents_for_sorting: | ||
# get the primary index_value | ||
idx = index_key_map[did] | ||
# add the secondary index values | ||
full_key = (idx, ) | ||
for km in second_indexes_key_map: | ||
full_key += (km[did], ) | ||
sort_set.append((full_key, did, self.__getitem__)) | ||
|
||
# Step 5) Sort after the secondary indexes. | ||
result = multisort(sort_set, sort_spec) | ||
|
||
return result, actual_result_count | ||
|
||
def sortResults(self, rs, sort_index, | ||
reverse=False, limit=None, merge=True, | ||
actual_result_count=None, b_start=0, b_size=None): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moving the sorting code into a separate method (which is in general a good idea) prevents
`actual_result_count` from being updated. You must either wrap `actual_result_count` (an int)
in a "mutable" object (i.e. one that can be updated in place) or return the updated value as part of the return value (and reassign it). `ZCatalog`
filters out hits for which at least one of the sort indexes lacks a value — a test examining the correct `actual_result_count`
would thus ensure that some hits lack a sort value and verify the correct result size.