Skip to content

Commit

Permalink
Update vendored DuckDB sources to 30aae77
Browse files Browse the repository at this point in the history
  • Loading branch information
duckdblabs-bot committed Dec 3, 2024
1 parent 30aae77 commit 47d98d1
Show file tree
Hide file tree
Showing 21 changed files with 425 additions and 160 deletions.
39 changes: 39 additions & 0 deletions src/duckdb/src/common/vector_operations/is_distinct_from.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,16 @@ idx_t PositionComparator::Final<duckdb::DistinctLessThan>(Vector &left, Vector &
return VectorOperations::DistinctGreaterThan(right, left, &sel, count, true_sel, false_sel, null_mask);
}

// Final-position comparison for the DistinctLessThanNullsFirst operator.
// Forwards to VectorOperations::DistinctGreaterThan with the two input vectors swapped
// (right is passed first, left second) and returns whatever count that call produces.
// The selection vector is passed by address; true_sel, false_sel and null_mask are forwarded unchanged.
template <>
idx_t PositionComparator::Final<duckdb::DistinctLessThanNullsFirst>(Vector &left, Vector &right,
                                                                    const SelectionVector &sel, idx_t count,
                                                                    optional_ptr<SelectionVector> true_sel,
                                                                    optional_ptr<SelectionVector> false_sel,
                                                                    optional_ptr<ValidityMask> null_mask) {
	// DistinctGreaterThan has NULLs last
	const idx_t selected =
	    VectorOperations::DistinctGreaterThan(right, left, &sel, count, true_sel, false_sel, null_mask);
	return selected;
}

template <>
idx_t PositionComparator::Final<duckdb::DistinctGreaterThan>(Vector &left, Vector &right, const SelectionVector &sel,
idx_t count, optional_ptr<SelectionVector> true_sel,
Expand All @@ -510,6 +520,16 @@ idx_t PositionComparator::Final<duckdb::DistinctGreaterThan>(Vector &left, Vecto
return VectorOperations::DistinctGreaterThan(left, right, &sel, count, true_sel, false_sel, null_mask);
}

// Final-position comparison for the DistinctGreaterThanNullsFirst operator.
// Forwards to VectorOperations::DistinctLessThan with the two input vectors swapped
// (right is passed first, left second) and returns whatever count that call produces.
// The selection vector is passed by address; true_sel, false_sel and null_mask are forwarded unchanged.
template <>
idx_t PositionComparator::Final<duckdb::DistinctGreaterThanNullsFirst>(Vector &left, Vector &right,
                                                                       const SelectionVector &sel, idx_t count,
                                                                       optional_ptr<SelectionVector> true_sel,
                                                                       optional_ptr<SelectionVector> false_sel,
                                                                       optional_ptr<ValidityMask> null_mask) {
	// DistinctLessThan has NULLs last
	const idx_t selected = VectorOperations::DistinctLessThan(right, left, &sel, count, true_sel, false_sel, null_mask);
	return selected;
}

using StructEntries = vector<unique_ptr<Vector>>;

static void ExtractNestedSelection(const SelectionVector &slice_sel, const idx_t count, const SelectionVector &sel,
Expand Down Expand Up @@ -1178,6 +1198,16 @@ idx_t VectorOperations::DistinctGreaterThan(Vector &left, Vector &right, optiona
null_mask);
}

// true := A > B with nulls being minimal
// Select-style entry point: instantiates TemplatedDistinctSelectOperation with the
// DistinctGreaterThanNullsFirst operator, keeping the operands in (left, right) order,
// and returns the value produced by that helper. All selection vectors and the
// null mask are handed through untouched.
idx_t VectorOperations::DistinctGreaterThanNullsFirst(Vector &left, Vector &right,
                                                      optional_ptr<const SelectionVector> sel, idx_t count,
                                                      optional_ptr<SelectionVector> true_sel,
                                                      optional_ptr<SelectionVector> false_sel,
                                                      optional_ptr<ValidityMask> null_mask) {
	using Comparator = duckdb::DistinctGreaterThanNullsFirst;
	const idx_t selected =
	    TemplatedDistinctSelectOperation<Comparator>(left, right, sel, count, true_sel, false_sel, null_mask);
	return selected;
}

// true := A >= B with nulls being maximal
idx_t VectorOperations::DistinctGreaterThanEquals(Vector &left, Vector &right, optional_ptr<const SelectionVector> sel,
idx_t count, optional_ptr<SelectionVector> true_sel,
Expand All @@ -1195,6 +1225,15 @@ idx_t VectorOperations::DistinctLessThan(Vector &left, Vector &right, optional_p
null_mask);
}

// true := A < B with nulls being minimal
// Implemented as "B > A" by instantiating TemplatedDistinctSelectOperation with the
// DistinctGreaterThanNullsFirst operator and swapping the operands (right, left).
//
// Parameters:
//   left, right         - the vectors to compare
//   sel                 - optional selection vector, forwarded to the helper
//   count               - number of rows, forwarded to the helper
//   true_sel, false_sel - optional output selection vectors, forwarded to the helper
//   null_mask           - optional validity mask, forwarded to the helper
// Returns the value produced by TemplatedDistinctSelectOperation.
idx_t VectorOperations::DistinctLessThanNullsFirst(Vector &left, Vector &right, optional_ptr<const SelectionVector> sel,
                                                   idx_t count, optional_ptr<SelectionVector> true_sel,
                                                   optional_ptr<SelectionVector> false_sel,
                                                   optional_ptr<ValidityMask> null_mask) {
	// Forward the caller's null_mask instead of dropping it (previously a hard-coded nullptr was
	// passed, so the parameter was silently ignored). This matches DistinctGreaterThanNullsFirst,
	// which hands its null_mask to the same helper.
	return TemplatedDistinctSelectOperation<duckdb::DistinctGreaterThanNullsFirst>(right, left, sel, count, true_sel,
	                                                                               false_sel, null_mask);
}

// true := A <= B with nulls being maximal
idx_t VectorOperations::DistinctLessThanEquals(Vector &left, Vector &right, optional_ptr<const SelectionVector> sel,
idx_t count, optional_ptr<SelectionVector> true_sel,
Expand Down
4 changes: 4 additions & 0 deletions src/duckdb/src/execution/aggregate_hashtable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ void GroupedAggregateHashTable::SetRadixBits(idx_t radix_bits_p) {
radix_bits = radix_bits_p;
}

// Accessor for the radix_bits value (the counterpart of SetRadixBits, which assigns it).
// Const member: reads the value without modifying the hash table.
idx_t GroupedAggregateHashTable::GetRadixBits() const {
	const idx_t current_radix_bits = radix_bits;
	return current_radix_bits;
}

void GroupedAggregateHashTable::Resize(idx_t size) {
D_ASSERT(size >= STANDARD_VECTOR_SIZE);
D_ASSERT(IsPowerOfTwo(size));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1495,20 +1495,20 @@ bool StringValueScanner::SkipUntilState(CSVState initial_state, CSVState until_s
bool first_column = true;
const idx_t to_pos = current_iterator.GetEndPos();
while (current_iterator.pos.buffer_pos < to_pos) {
state_machine->Transition(current_state, buffer_handle_ptr[current_iterator.pos.buffer_pos++]);
state_machine_strict->Transition(current_state, buffer_handle_ptr[current_iterator.pos.buffer_pos++]);
if (current_state.IsState(CSVState::STANDARD) || current_state.IsState(CSVState::STANDARD_NEWLINE)) {
while (current_iterator.pos.buffer_pos + 8 < to_pos) {
uint64_t value = Load<uint64_t>(
reinterpret_cast<const_data_ptr_t>(&buffer_handle_ptr[current_iterator.pos.buffer_pos]));
if (ContainsZeroByte((value ^ state_machine->transition_array.delimiter) &
(value ^ state_machine->transition_array.new_line) &
(value ^ state_machine->transition_array.carriage_return) &
(value ^ state_machine->transition_array.comment))) {
if (ContainsZeroByte((value ^ state_machine_strict->transition_array.delimiter) &
(value ^ state_machine_strict->transition_array.new_line) &
(value ^ state_machine_strict->transition_array.carriage_return) &
(value ^ state_machine_strict->transition_array.comment))) {
break;
}
current_iterator.pos.buffer_pos += 8;
}
while (state_machine->transition_array
while (state_machine_strict->transition_array
.skip_standard[static_cast<uint8_t>(buffer_handle_ptr[current_iterator.pos.buffer_pos])] &&
current_iterator.pos.buffer_pos < to_pos - 1) {
current_iterator.pos.buffer_pos++;
Expand All @@ -1518,14 +1518,14 @@ bool StringValueScanner::SkipUntilState(CSVState initial_state, CSVState until_s
while (current_iterator.pos.buffer_pos + 8 < to_pos) {
uint64_t value = Load<uint64_t>(
reinterpret_cast<const_data_ptr_t>(&buffer_handle_ptr[current_iterator.pos.buffer_pos]));
if (ContainsZeroByte((value ^ state_machine->transition_array.quote) &
(value ^ state_machine->transition_array.escape))) {
if (ContainsZeroByte((value ^ state_machine_strict->transition_array.quote) &
(value ^ state_machine_strict->transition_array.escape))) {
break;
}
current_iterator.pos.buffer_pos += 8;
}

while (state_machine->transition_array
while (state_machine_strict->transition_array
.skip_quoted[static_cast<uint8_t>(buffer_handle_ptr[current_iterator.pos.buffer_pos])] &&
current_iterator.pos.buffer_pos < to_pos - 1) {
current_iterator.pos.buffer_pos++;
Expand All @@ -1535,7 +1535,7 @@ bool StringValueScanner::SkipUntilState(CSVState initial_state, CSVState until_s
current_state.IsState(CSVState::RECORD_SEPARATOR)) &&
first_column) {
if (buffer_handle_ptr[current_iterator.pos.buffer_pos - 1] ==
state_machine->dialect_options.state_machine_options.quote.GetValue()) {
state_machine_strict->dialect_options.state_machine_options.quote.GetValue()) {
quoted = true;
}
}
Expand Down Expand Up @@ -1586,7 +1586,7 @@ bool StringValueScanner::IsRowValid(CSVIterator &current_iterator) const {
}
constexpr idx_t result_size = 1;
auto scan_finder =
make_uniq<StringValueScanner>(0U, buffer_manager, state_machine, make_shared_ptr<CSVErrorHandler>(),
make_uniq<StringValueScanner>(0U, buffer_manager, state_machine_strict, make_shared_ptr<CSVErrorHandler>(),
csv_file_scan, false, current_iterator, result_size);
auto &tuples = scan_finder->ParseChunk();
current_iterator.pos = scan_finder->GetIteratorPosition();
Expand Down Expand Up @@ -1629,6 +1629,17 @@ void StringValueScanner::SetStart() {
if (iterator.GetEndPos() > cur_buffer_handle->actual_size) {
iterator.SetEnd(cur_buffer_handle->actual_size);
}
if (!state_machine_strict) {
// We need to initialize our strict state machine
auto &state_machine_cache = CSVStateMachineCache::Get(buffer_manager->context);
auto state_options = state_machine->state_machine_options;
// To set the state machine to be strict we ensure that rfc_4180 is set to true
if (!state_options.rfc_4180.IsSetByUser()) {
state_options.rfc_4180 = true;
}
state_machine_strict =
make_shared_ptr<CSVStateMachine>(state_machine_cache.Get(state_options), state_machine->options);
}
// At this point we have 3 options:
// 1. We are at the start of a valid line
ValidRowInfo best_row = TryRow(CSVState::STANDARD_NEWLINE, iterator.pos.buffer_pos, iterator.GetEndPos());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,11 @@ void CSVSniffer::GenerateStateMachineSearchSpace(vector<unique_ptr<ColumnCountSc
} else {
new_line_id = DetectNewLineDelimiter(*buffer_manager);
}
bool rfc_4180 = options.dialect_options.state_machine_options.rfc_4180.GetValue();
// We only sniff RFC 4180 rules, unless manually set by user.
bool rfc_4180 = true;
if (options.dialect_options.state_machine_options.rfc_4180.IsSetByUser()) {
rfc_4180 = options.dialect_options.state_machine_options.rfc_4180.GetValue();
}
CSVIterator first_iterator;
bool iterator_set = false;
for (const auto quote_rule : dialect_candidates.quote_rule_candidates) {
Expand Down Expand Up @@ -370,18 +374,25 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<ColumnCountScanner> scanner,
if (!scanner->ever_escaped && candidates.front()->ever_escaped) {
return;
}
if (best_consistent_rows == consistent_rows) {
if (best_consistent_rows == consistent_rows && num_cols >= max_columns_found) {
// If both have not been escaped, this might get solved later on.
sniffing_state_machine.dialect_options.num_cols = num_cols;
candidates.emplace_back(std::move(scanner));
max_columns_found = num_cols;
return;
}
}
}
if (max_columns_found == num_cols && ignored_rows > min_ignored_rows) {
return;
}

if (quoted && num_cols < max_columns_found) {
for (auto &candidate : candidates) {
if (candidate->ever_quoted) {
return;
}
}
}
best_consistent_rows = consistent_rows;
max_columns_found = num_cols;
prev_padding_count = padding_count;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,21 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
auto &transition_array = state_machine_cache[state_machine_options];

for (uint32_t i = 0; i < StateMachine::NUM_STATES; i++) {
CSVState cur_state = static_cast<CSVState>(i);
const auto cur_state = static_cast<CSVState>(i);
switch (cur_state) {
case CSVState::QUOTED:
case CSVState::QUOTED_NEW_LINE:
case CSVState::ESCAPE:
InitializeTransitionArray(transition_array, cur_state, CSVState::QUOTED);
break;
case CSVState::UNQUOTED:
InitializeTransitionArray(transition_array, cur_state, CSVState::INVALID);
if (state_machine_options.rfc_4180.GetValue()) {
// If we have an unquoted state, following rfc 4180, our base state is invalid
InitializeTransitionArray(transition_array, cur_state, CSVState::INVALID);
} else {
// This will allow us to accept unescaped quotes
InitializeTransitionArray(transition_array, cur_state, CSVState::UNQUOTED);
}
break;
case CSVState::COMMENT:
InitializeTransitionArray(transition_array, cur_state, CSVState::COMMENT);
Expand All @@ -41,19 +47,19 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
}
}

const string delimiter_value = state_machine_options.delimiter.GetValue();
uint8_t delimiter_first_byte = static_cast<uint8_t>(delimiter_value[0]);
uint8_t quote = static_cast<uint8_t>(state_machine_options.quote.GetValue());
uint8_t escape = static_cast<uint8_t>(state_machine_options.escape.GetValue());
uint8_t comment = static_cast<uint8_t>(state_machine_options.comment.GetValue());
const auto delimiter_value = state_machine_options.delimiter.GetValue();
const auto delimiter_first_byte = static_cast<uint8_t>(delimiter_value[0]);
const auto quote = static_cast<uint8_t>(state_machine_options.quote.GetValue());
const auto escape = static_cast<uint8_t>(state_machine_options.escape.GetValue());
const auto comment = static_cast<uint8_t>(state_machine_options.comment.GetValue());

auto new_line_id = state_machine_options.new_line.GetValue();
const auto new_line_id = state_machine_options.new_line.GetValue();

const bool multi_byte_delimiter = delimiter_value.size() != 1;

bool enable_unquoted_escape = state_machine_options.rfc_4180.GetValue() == false &&
state_machine_options.quote != state_machine_options.escape;

state_machine_options.quote != state_machine_options.escape &&
state_machine_options.escape != '\0';
// Now set values depending on configuration
// 1) Standard/Invalid State
vector<uint8_t> std_inv {static_cast<uint8_t>(CSVState::STANDARD), static_cast<uint8_t>(CSVState::INVALID),
Expand Down Expand Up @@ -190,7 +196,8 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
transition_array['\n'][static_cast<uint8_t>(CSVState::QUOTED)] = CSVState::QUOTED_NEW_LINE;
transition_array['\r'][static_cast<uint8_t>(CSVState::QUOTED)] = CSVState::QUOTED_NEW_LINE;

if (state_machine_options.quote != state_machine_options.escape) {
if (state_machine_options.quote != state_machine_options.escape &&
state_machine_options.escape.GetValue() != '\0') {
transition_array[escape][static_cast<uint8_t>(CSVState::QUOTED)] = CSVState::ESCAPE;
}
// 6) Unquoted State
Expand All @@ -209,7 +216,10 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
transition_array[delimiter_first_byte][static_cast<uint8_t>(CSVState::UNQUOTED)] = CSVState::DELIMITER;
}
if (state_machine_options.quote == state_machine_options.escape) {
transition_array[escape][static_cast<uint8_t>(CSVState::UNQUOTED)] = CSVState::QUOTED;
transition_array[quote][static_cast<uint8_t>(CSVState::UNQUOTED)] = CSVState::QUOTED;
}
if (state_machine_options.escape == '\0' && state_machine_options.rfc_4180 == false) {
transition_array[quote][static_cast<uint8_t>(CSVState::UNQUOTED)] = CSVState::QUOTED;
}
if (comment != '\0') {
transition_array[comment][static_cast<uint8_t>(CSVState::UNQUOTED)] = CSVState::COMMENT;
Expand Down Expand Up @@ -243,7 +253,8 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op

// 9) Quoted NewLine
transition_array[quote][static_cast<uint8_t>(CSVState::QUOTED_NEW_LINE)] = CSVState::UNQUOTED;
if (state_machine_options.quote != state_machine_options.escape) {
if (state_machine_options.quote != state_machine_options.escape &&
state_machine_options.escape.GetValue() != '\0') {
transition_array[escape][static_cast<uint8_t>(CSVState::QUOTED_NEW_LINE)] = CSVState::ESCAPE;
}

Expand Down
2 changes: 1 addition & 1 deletion src/duckdb/src/execution/operator/join/physical_iejoin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -903,7 +903,7 @@ class IEJoinGlobalSourceState : public GlobalSourceState {
const auto r = MinValue(next_right.load(), right_outers.load());
const auto returned = completed.load() + l + r;

return count ? (double(returned) / double(count)) : -1;
return count ? (100.0 * double(returned) / double(count)) : -1;
}

const PhysicalIEJoin &op;
Expand Down
Loading

0 comments on commit 47d98d1

Please sign in to comment.