sort-on multiple indexes is broken #81 #82

Open · wants to merge 13 commits into base: master
180 changes: 132 additions & 48 deletions src/Products/ZCatalog/Catalog.py
@@ -10,7 +10,7 @@
# FOR A PARTICULAR PURPOSE
#
##############################################################################

import heapq
import logging
from bisect import bisect
from collections import defaultdict
@@ -822,7 +822,7 @@ def _sort_nbest(self, actual_result_count, result, rs,
sort_index, sort_index_length, sort_spec,
second_indexes_key_map):
# Limit / sort results using N-Best algorithm
# This is faster for large sets then a full sort
# This is faster for large sets than a full sort
# And uses far less memory
index_key_map = sort_index.documentToKeyMap()
keys = []
@@ -848,27 +848,16 @@ def _sort_nbest(self, actual_result_count, result, rs,
worst = keys[0]
result.reverse()
else:
for did in rs:
try:
key = index_key_map[did]
full_key = (key, )
for km in second_indexes_key_map:
full_key += (km[did], )
except KeyError:
# This document is not in the sort key index, skip it.
actual_result_count -= 1
else:
if n >= limit and key <= worst:
continue
i = bisect(keys, key)
keys.insert(i, key)
result.insert(i, (full_key, did, self.__getitem__))
if n == limit:
del keys[0], result[0]
else:
n += 1
worst = keys[0]
result = multisort(result, sort_spec)
# we have multi-index sorting
result, actual_result_count = self._multi_index_nbest(
actual_result_count,
Contributor:
Moving the sorting code into a separate method (which is in general a good idea) prevents actual_result_count from being updated. You must either wrap the (int) actual_result_count in a "mutable" object (i.e. one that can be updated in place) or return the updated value as part of the return value (and reassign). ZCatalog filters out hits for which at least one of the sort indexes lacks a value - a test examining actual_result_count should therefore ensure that some hits lack a sort value and verify the correct result size.

result,
rs,
limit,
sort_index,
sort_spec,
second_indexes_key_map,
reverse=True)

return (actual_result_count, 0, result)
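
A minimal, standalone sketch of the return-and-reassign pattern the contributor asks for above; the helper name and argument shapes are illustrative, not the actual Catalog API:

def _limit_and_count(count, rs, limit, key_map):
    # An int argument is rebound locally, so "count -= 1" is invisible to
    # the caller; the corrected value has to travel back in the return.
    keyed = []
    for did in rs:
        try:
            keyed.append((key_map[did], did))
        except KeyError:
            # document has no value in the sort index: drop it
            count -= 1
    keyed.sort()
    return keyed[:limit], count

# caller side: reassign both values
# result, actual_result_count = _limit_and_count(
#     actual_result_count, rs, limit, index_key_map)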

@@ -877,11 +866,11 @@ def _sort_nbest_reverse(self, actual_result_count, result, rs,
sort_index, sort_index_length, sort_spec,
second_indexes_key_map):
# Limit / sort results using N-Best algorithm in reverse (N-Worst?)
index_key_map = sort_index.documentToKeyMap()
keys = []
n = 0
best = None
if sort_index_length == 1:
index_key_map = sort_index.documentToKeyMap()
Contributor:
Ideally, sort_nbest and sort_nbest_reverse would be symmetric. Your change above seems to have reduced the symmetry.

You have refactored the multi-index sorting into a single method. I would go a step further and merge sort_nbest and sort_nbest_reverse into a single function (in my view those methods are also named unintuitively; the heapq module contains similar functions named nsmallest and nlargest, respectively - which would be much better names). I would use functools.partial to instantiate the proper sorting function from it.

keys = []
n = 0
best = None
for did in rs:
try:
key = index_key_map[did]
@@ -900,30 +889,125 @@
n += 1
best = keys[-1]
else:
for did in rs:
try:
key = index_key_map[did]
full_key = (key, )
for km in second_indexes_key_map:
full_key += (km[did], )
except KeyError:
# This document is not in the sort key index, skip it.
actual_result_count -= 1
else:
if n >= limit and key >= best:
continue
i = bisect(keys, key)
keys.insert(i, key)
result.insert(i, (full_key, did, self.__getitem__))
if n == limit:
del keys[-1], result[-1]
else:
n += 1
best = keys[-1]
result = multisort(result, sort_spec)
# we have multi-index sorting
result, actual_result_count = self._multi_index_nbest(
actual_result_count,
result,
rs,
limit,
sort_index,
sort_spec,
second_indexes_key_map,
reverse=False
)

return (actual_result_count, 0, result)
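
A rough sketch of the merge suggested in the comment above, assuming one generic helper plus functools.partial to derive the nsmallest/nlargest flavours; the names are hypothetical, not the real Catalog methods:

import heapq
from functools import partial

def _n_sorted(picker, count, rs, key_map, limit):
    # picker is heapq.nsmallest or heapq.nlargest; documents without a
    # value in the sort index are dropped and the count adjusted.
    keyed = []
    for did in rs:
        try:
            keyed.append((key_map[did], did))
        except KeyError:
            count -= 1
    return picker(limit, keyed), count

# the two former methods become thin wrappers around the same helper
sort_nsmallest = partial(_n_sorted, heapq.nsmallest)
sort_nlargest = partial(_n_sorted, heapq.nlargest)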

def _multi_index_nbest(self, actual_result_count, result,
result_set, limit, sort_index, sort_spec,
second_indexes_key_map, reverse=True):
"""
Example:
Limit = 2
Sorton = (last_name, num)
SortOrder = smallest first
Example data:
meier 3
lopez 4
meier 2
smith 1

Expected sort result:
lopez 4
meier 2

Even though the first two records would satisfy limit=2, the third record has to be taken into account
when sorting on both indexes.

For multiple indexes the strategy is:
1) For the first index get the index_values for all documents
Result:
['meier', 'lopez', 'meier', 'smith']

2) Sort the index_values using heapq to get the 'limit' largest/smallest index_values
Result:
['lopez', 'meier']

3) Find all documents having index_values 'lopez' or 'meier'
meier 3
lopez 4
meier 2

4) Get the index_value for the other search indexes

5) Sort these documents on all indexes.

"""

# ToDo: shortcut for limit >= "all"

# Step 1)
# All index_values as a non unique list
index_values = []
# get the mapping of document ID to index value
index_key_map = sort_index.documentToKeyMap()

# get index values for all document IDs (did) of the result set
for did in result_set:
# get the index value of the current document id
try:
index_value = index_key_map[did]
except KeyError:
# This document is not in the sort key index, skip it.
# ToDo: Is this the correct/intended behavior???
actual_result_count -= 1
Contributor:
As actual_result_count is an int and thus immutable, this does not change the value in the caller. As mentioned before, you must do something special to get the updated value back to the caller (e.g. return the updated value as part of the return value).

else:
# We found a valid document
index_values.append(index_value)

# Step 2)
# Get the 'limit' smallest/largest of index_values
if reverse:
limited_index_values = heapq.nlargest(limit, index_values)
else:
limited_index_values = heapq.nsmallest(limit, index_values)
# we have to reverse the results since the following code expects
# the index_values in descending order
limited_index_values.reverse()

# Step 3)

# get the first index_value
last_index_value = limited_index_values[0]
# store all documents belonging to it
all_documents_for_sorting = list(sort_index._index[last_index_value])
# for all other index values
for idx, current_index_value in enumerate(limited_index_values[1:]):
# Duplicate checking
if last_index_value != current_index_value:
# We have a fresh index_value
# store all documents belonging to it
all_documents_for_sorting += list(sort_index._index[current_index_value])
# remember the index_value for duplicate checking
last_index_value = current_index_value

# Step 4) Get the index_values for the other search indexes
# sort_set collects the tuple of index_values per document
sort_set = []
for did in all_documents_for_sorting:
# get the primary index_value
idx = index_key_map[did]
# add the secondary index values
full_key = (idx, )
for km in second_indexes_key_map:
full_key += (km[did], )
sort_set.append((full_key, did, self.__getitem__))

# Step 5) Sort these documents on all indexes.
result = multisort(sort_set, sort_spec)

return result, actual_result_count
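
To make the five-step walkthrough in the docstring concrete, here is a small self-contained example on its sample data (plain dicts stand in for the catalog indexes; nothing here is the real ZCatalog API):

import heapq

# document id -> (last_name, num), as in the docstring example
docs = {1: ('meier', 3), 2: ('lopez', 4), 3: ('meier', 2), 4: ('smith', 1)}
limit = 2

# Step 1/2: the 'limit' smallest primary keys
smallest = heapq.nsmallest(limit, (v[0] for v in docs.values()))
# -> ['lopez', 'meier']

# Step 3: every document whose primary key made the cut; ('meier', 3)
# still has to be considered even though only two hits are wanted
candidates = [did for did, v in docs.items() if v[0] in smallest]

# Step 4/5: sort the candidates on the full key, apply the limit last
ranked = sorted(candidates, key=lambda did: docs[did])[:limit]
print([docs[did] for did in ranked])  # [('lopez', 4), ('meier', 2)]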

def sortResults(self, rs, sort_index,
reverse=False, limit=None, merge=True,
actual_result_count=None, b_start=0, b_size=None):