From b9fcc396ac9d5c12e2211b17a5c0ab862e1a18eb Mon Sep 17 00:00:00 2001 From: eileen Date: Mon, 10 Feb 2025 18:52:03 +1300 Subject: [PATCH] Take static copy of the query finder for the legacy code function --- CRM/Dedupe/BAO/DedupeRuleGroup.php | 14 +- CRM/Dedupe/FinderQueryOptimizer.php | 48 --- .../Civi/LegacyFinder/Finder.php | 11 +- .../LegacyFinder/FinderQueryOptimizer.php | 276 ++++++++++++++++++ 4 files changed, 291 insertions(+), 58 deletions(-) create mode 100644 ext/legacydedupefinder/Civi/LegacyFinder/FinderQueryOptimizer.php diff --git a/CRM/Dedupe/BAO/DedupeRuleGroup.php b/CRM/Dedupe/BAO/DedupeRuleGroup.php index e494d3bf839c..ad9c64cf1363 100644 --- a/CRM/Dedupe/BAO/DedupeRuleGroup.php +++ b/CRM/Dedupe/BAO/DedupeRuleGroup.php @@ -157,16 +157,20 @@ public static function hook_civicrm_findExistingDuplicates(GenericHookEvent $eve if (empty($tableQueries)) { return; } - $threshold = $ruleGroup->threshold; - $tempTable = $ruleGroup->runTablesQuery([], $tableQueries, $threshold); + $ruleGroup = DedupeRuleGroup::get(FALSE) + ->addWhere('id', '=', $ruleGroup->id) + ->execute() + ->first(); + $self = new self(); + $tempTable = $self->runTablesQuery([], $tableQueries, $ruleGroup['threshold']); if (!$tempTable) { return; } $aclFrom = $aclWhere = ''; $dedupeTable = $tempTable; - $contactType = $ruleGroup->contact_type; - $threshold = $ruleGroup->threshold; + $contactType = $ruleGroup['contact_type']; + $threshold = $ruleGroup['threshold']; if ($event->checkPermissions) { [$aclFrom, $aclWhere] = CRM_Contact_BAO_Contact_Permission::cacheClause(['c1', 'c2']); @@ -192,7 +196,7 @@ public static function hook_civicrm_findExistingDuplicates(GenericHookEvent $eve $duplicates[] = ['entity_id_1' => $dao->id1, 'entity_id_2' => $dao->id2, 'weight' => $dao->weight]; } $event->duplicates = $duplicates; - \CRM_Core_DAO::executeQuery($ruleGroup->tableDropQuery()); + \CRM_Core_DAO::executeQuery('DROP TEMPORARY TABLE IF EXISTS ' . 
$dedupeTable); } /** diff --git a/CRM/Dedupe/FinderQueryOptimizer.php b/CRM/Dedupe/FinderQueryOptimizer.php index 884984a19d3c..a4ee16a46f92 100644 --- a/CRM/Dedupe/FinderQueryOptimizer.php +++ b/CRM/Dedupe/FinderQueryOptimizer.php @@ -69,30 +69,6 @@ public function __construct(int $dedupeRuleGroupID, array $contactIDs, array $pa } } - /** - * Is a file based reserved query configured. - * - * File based reserved queries were an early idea about how to optimise the dedupe queries. - * - * In theory extensions could implement them although there is no evidence any of them have. - * However, if these are implemented by core or by extensions we should not attempt to optimise - * the query by (e.g.) combining queries. - * - * In practice the queries implemented only return one query anyway - * - * @internal for core use only. - * - * @return bool - * @throws \CRM_Core_Exception - * - * @see \CRM_Dedupe_BAO_QueryBuilder_IndividualGeneral - * @see \CRM_Dedupe_BAO_QueryBuilder_IndividualSupervised - */ - public function isUseReservedQuery(): bool { - return $this->lookup('RuleGroup', 'is_reserved') && - CRM_Utils_File::isIncludable('CRM/Dedupe/BAO/QueryBuilder/' . $this->lookup('RuleGroup', 'name') . '.php'); - } - /** * Return the SQL query for the given rule - either for finding matching * pairs of contacts, or for matching against the $params variable (if set). @@ -228,30 +204,6 @@ private function getContactIDFieldName(string $tableName): string { throw new CRM_Core_Exception('invalid field'); } - /** - * Get the reserved query based on a static class. - * - * This was an early idea about optimisation & extendability. It is likely - * there are no implementations of rules this way outside the 3 core files. - * - * It is also likely the core files can go once we are optimising the queries based on the - * rule. - * - * @internal Do not call from outside of core. 
- * - * @return array - * @throws \CRM_Core_Exception - */ - public function getReservedQuery(): array { - $bao = new CRM_Dedupe_BAO_DedupeRuleGroup(); - $bao->id = $this->lookup('RuleGroup', 'id'); - $bao->find(TRUE); - $bao->params = $this->lookupParameters; - $bao->contactIds = $this->contactIDs; - $command = empty($this->lookupParameters) ? 'internal' : 'record'; - return call_user_func(["CRM_Dedupe_BAO_QueryBuilder_" . $this->lookup('RuleGroup', 'name'), $command], $bao); - } - /** * Get the queries to fill the table for the various rules. * diff --git a/ext/legacydedupefinder/Civi/LegacyFinder/Finder.php b/ext/legacydedupefinder/Civi/LegacyFinder/Finder.php index b5936d7c421d..17671d584ed4 100644 --- a/ext/legacydedupefinder/Civi/LegacyFinder/Finder.php +++ b/ext/legacydedupefinder/Civi/LegacyFinder/Finder.php @@ -2,6 +2,7 @@ namespace Civi\LegacyFinder; +use Civi\LegacyFinder\FinderQueryOptimizer; use Civi\Core\Event\GenericHookEvent; use Civi\Core\Service\AutoSubscriber; @@ -29,7 +30,7 @@ public static function findExistingDuplicates(GenericHookEvent $event): void { $contactIDs = explode(',', \CRM_Core_DAO::singleValueQuery('SELECT GROUP_CONCAT(id) FROM ' . $event->tableName)); } $ruleGroup->contactIds = $contactIDs; - // make sure we've got a fetched dbrecord, not sure if this is enforced + // make sure we've got a fetched db record, not sure if this is enforced $ruleGroup->find(TRUE); $tempTable = self::fillTable($ruleGroup, $ruleGroup->id, $contactIDs, []); if (!$tempTable) { @@ -116,21 +117,21 @@ public static function findDuplicates(GenericHookEvent $event): void { $event->dedupeResults['ids'] = array_diff($dupes, $event->dedupeParams['excluded_contact_ids']); } - /** * Fill the dedupe finder table. * - * @internal do not access from outside core. 
+ * @internal do not access from outside core * + * @param \CRM_Dedupe_BAO_DedupeRuleGroup $ruleGroup * @param int $id * @param array $contactIDs * @param array $params * * @return false|string - * @throws \Civi\Core\Exception\DBQueryException + * @throws \CRM_Core_Exception */ private static function fillTable($ruleGroup, int $id, array $contactIDs, array $params) { - $optimizer = new \CRM_Dedupe_FinderQueryOptimizer($id, $contactIDs, $params); + $optimizer = new FinderQueryOptimizer($id, $contactIDs, $params); // Reserved Rule Groups can optionally get special treatment by // implementing an optimization class and returning a query array. if ($optimizer->isUseReservedQuery()) { diff --git a/ext/legacydedupefinder/Civi/LegacyFinder/FinderQueryOptimizer.php b/ext/legacydedupefinder/Civi/LegacyFinder/FinderQueryOptimizer.php new file mode 100644 index 000000000000..6c3f718ce8d3 --- /dev/null +++ b/ext/legacydedupefinder/Civi/LegacyFinder/FinderQueryOptimizer.php @@ -0,0 +1,276 @@ +define('DedupeRuleGroup', 'RuleGroup', ['id' => $dedupeRuleGroupID]); + foreach ($contactIDs as $cid) { + $this->contactIDs[] = CRM_Utils_Type::escape($cid, 'Integer'); + } + $this->lookupParameters = $params; + $rules = DedupeRule::get(FALSE) + ->addSelect('*', 'dedupe_rule_group_id.threshold') + ->addWhere('dedupe_rule_group_id', '=', $dedupeRuleGroupID) + ->addOrderBy('rule_weight', 'DESC') + ->execute(); + foreach ($rules as $index => $rule) { + // Filter out the rule if this is a parameters lookup & it is not in the rules. + if (!$this->lookupParameters || (array_key_exists($rule['rule_table'], $this->lookupParameters) && array_key_exists($rule['rule_field'], $this->lookupParameters[$rule['rule_table']]))) { + $key = $rule['rule_table'] . '.' . $rule['rule_field'] . '.' . 
$rule['rule_weight']; + $this->queries[$key] = [ + 'table' => $rule['rule_table'], + 'field' => $rule['rule_field'], + 'weight' => $rule['rule_weight'], + 'length' => $rule['rule_length'], + 'key' => $key, + 'order' => $index + 1, + ]; + $this->queries[$key]['query'] = $this->getQuery($this->queries[$key]); + } + $this->threshold = $rule['dedupe_rule_group_id.threshold']; + } + } + + /** + * Is a file based reserved query configured. + * + * File based reserved queries were an early idea about how to optimise the dedupe queries. + * + * In theory extensions could implement them although there is no evidence any of them have. + * However, if these are implemented by core or by extensions we should not attempt to optimise + * the query by (e.g.) combining queries. + * + * In practice the queries implemented only return one query anyway + * + * @internal for core use only. + * + * @return bool + * @throws \CRM_Core_Exception + * + * @see \CRM_Dedupe_BAO_QueryBuilder_IndividualGeneral + * @see \CRM_Dedupe_BAO_QueryBuilder_IndividualSupervised + */ + public function isUseReservedQuery(): bool { + return $this->lookup('RuleGroup', 'is_reserved') && + CRM_Utils_File::isIncludable('CRM/Dedupe/BAO/QueryBuilder/' . $this->lookup('RuleGroup', 'name') . '.php'); + } + + /** + * Return the SQL query for the given rule - either for finding matching + * pairs of contacts, or for matching against the $params variable (if set). + * + * @param array $rule + * + * @return string + * SQL query performing the search + * or NULL if params is present and doesn't have and for a field. + * + * @throws \CRM_Core_Exception + * @internal do not call from outside tested core code. No universe uses Feb 2024. 
+ * + */ + public function getQuery(array $rule): ?string { + + $filter = $this->getRuleTableFilter($rule['table']); + $contactIDFieldName = $this->getContactIDFieldName($rule['table']); + + // build FROM (and WHERE, if it's a parametrised search) + // based on whether the rule is about substrings or not + if ($this->lookupParameters) { + $select = "t1.$contactIDFieldName id1, {$rule['weight']} weight"; + $subSelect = 'id1, weight'; + $where = $filter ? ['t1.' . $filter] : []; + $from = "{$rule['table']} t1"; + $str = 'NULL'; + if (isset($this->lookupParameters[$rule['table']][$rule['field']])) { + $str = trim(CRM_Utils_Type::escape($this->lookupParameters[$rule['table']][$rule['field']], 'String')); + } + if ($rule['length']) { + $where[] = "SUBSTR(t1.{$rule['field']}, 1, {$rule['length']}) = SUBSTR('$str', 1, {$rule['length']})"; + $where[] = "t1.{$rule['field']} IS NOT NULL"; + } + else { + $where[] = "t1.{$rule['field']} = '$str'"; + } + } + else { + $select = "t1.$contactIDFieldName id1, t2.$contactIDFieldName id2, {$rule['weight']} weight"; + $subSelect = 'id1, id2, weight'; + $where = $filter ? [ + 't1.' . $filter, + 't2.' . $filter, + ] : []; + $where[] = "t1.$contactIDFieldName < t2.$contactIDFieldName"; + $from = "{$rule['table']} t1 INNER JOIN {$rule['table']} t2 ON (" . self::getRuleFieldFilter($rule) . ")"; + } + + $sql = "SELECT $select FROM $from WHERE " . implode(' AND ', $where); + if ($this->contactIDs) { + $cids = $this->contactIDs; + $sql .= " AND t1.$contactIDFieldName IN (" . implode(',', $cids) . ") + UNION $sql AND t2.$contactIDFieldName IN (" . implode(',', $cids) . ")"; + + // The `weight` is ambiguous in the context of the union; put the whole + // thing in a subquery. + $sql = "SELECT $subSelect FROM ($sql) subunion"; + } + return $sql; + } + + /** + * Get any where filter that restricts the specific table. 
+ * + * Generally this is along the lines of entity_table = civicrm_contact + * although for the contact table it could be the id restriction. + * + * @param string $tableName + * + * @return string + */ + private function getRuleTableFilter(string $tableName): string { + if ($tableName === 'civicrm_contact') { + return "contact_type = '" . $this->lookup('RuleGroup', 'contact_type') . "'"; + } + $dynamicReferences = CRM_Core_DAO::getDynamicReferencesToTable('civicrm_contact')[$tableName] ?? NULL; + if (!$dynamicReferences) { + return ''; + } + if (!empty(CRM_Core_DAO::getDynamicReferencesToTable('civicrm_contact')[$tableName])) { + return $dynamicReferences[1] . "= 'civicrm_contact'"; + } + return ''; + } + + /** + * @param array $rule + * + * @return string + * @throws \CRM_Core_Exception + */ + private function getRuleFieldFilter(array $rule): string { + if ($rule['length']) { + $on = ["SUBSTR(t1.{$rule['field']}, 1, {$rule['length']}) = SUBSTR(t2.{$rule['field']}, 1, {$rule['length']})"]; + return "(" . implode(' AND ', $on) . ")"; + } + $innerJoinClauses = [ + "t1.{$rule['field']} IS NOT NULL", + "t2.{$rule['field']} IS NOT NULL", + "t1.{$rule['field']} = t2.{$rule['field']}", + ]; + + if (in_array(CRM_Dedupe_BAO_DedupeRule::getFieldType($rule['field'], $rule['table']), CRM_Utils_Type::getTextTypes(), TRUE)) { + $innerJoinClauses[] = "t1.{$rule['field']} <> ''"; + $innerJoinClauses[] = "t2.{$rule['field']} <> ''"; + } + return "(" . implode(' AND ', $innerJoinClauses) . ")"; + } + + /** + * Get the name of the field in the table that refers to the Contact ID. + * + * e.g in civicrm_contact this is 'id' whereas in civicrm_address this is + * contact_id and in a custom field table it might be entity_id. + * + * @param string $tableName + * + * @return string + * Usually id, contact_id or entity_id. 
+ * @throws \CRM_Core_Exception + */ + private function getContactIDFieldName(string $tableName): string { + if ($tableName === 'civicrm_contact') { + return 'id'; + } + if (isset(CRM_Core_DAO::getDynamicReferencesToTable('civicrm_contact')[$tableName][0])) { + return CRM_Core_DAO::getDynamicReferencesToTable('civicrm_contact')[$tableName][0]; + } + if (isset(\CRM_Core_DAO::getReferencesToContactTable()[$tableName][0])) { + return \CRM_Core_DAO::getReferencesToContactTable()[$tableName][0]; + } + throw new CRM_Core_Exception('invalid field'); + } + + /** + * Get the reserved query based on a static class. + * + * This was an early idea about optimisation & extendability. It is likely + * there are no implementations of rules this way outside the 3 core files. + * + * It is also likely the core files can go once we are optimising the queries based on the + * rule. + * + * @internal Do not call from outside of core. + * + * @return array + * @throws \CRM_Core_Exception + */ + public function getReservedQuery(): array { + $bao = new CRM_Dedupe_BAO_DedupeRuleGroup(); + $bao->id = $this->lookup('RuleGroup', 'id'); + $bao->find(TRUE); + $bao->params = $this->lookupParameters; + $bao->contactIds = $this->contactIDs; + $command = empty($this->lookupParameters) ? 'internal' : 'record'; + return call_user_func(["CRM_Dedupe_BAO_QueryBuilder_" . $this->lookup('RuleGroup', 'name'), $command], $bao); + } + + /** + * Get the queries to fill the table for the various rules. + * + * Return a set of SQL queries whose cumulative weights will mark matched + * records for the RuleGroup::thresholdQuery() to retrieve. + * + * @internal do not call from outside tested core code. + * + * @return array + * @throws \CRM_Core_Exception + */ + public function getRuleQueries(): array { + $queries = []; + foreach ($this->queries as $rule) { + $queries[$rule['key']] = $rule['query']; + } + return $queries; + } + +}