Skip to content

Commit

Permalink
Fix reading FLOAT from multiple row groups
Browse files Browse the repository at this point in the history
  • Loading branch information
gaborcsardi committed Feb 8, 2025
1 parent c53fcc1 commit 8a5fb13
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 3 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
* `write_parquet()` now correctly converts double `Date` columns
to integer columns (@eitsupi, #116).

* `read_parquet()` now correctly reads `FLOAT` columns from files with
multiple row groups.

# nanoparquet 0.4.0

* API changes:
Expand Down
12 changes: 9 additions & 3 deletions src/RParquetReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,10 +230,12 @@ rtype::rtype(parquet::SchemaElement &sel) {
case parquet::Type::BOOLEAN:
type = tmptype = LGLSXP;
elsize = sizeof(int);
psize = 0; // not really true of course...
break;
case parquet::Type::INT32:
type = tmptype = INTSXP;
elsize = sizeof(int);
psize = 4;
if ((sel.__isset.logicalType && sel.logicalType.__isset.DATE) ||
(sel.__isset.converted_type &&
sel.converted_type == parquet::ConvertedType::DATE)) {
Expand Down Expand Up @@ -279,6 +281,7 @@ rtype::rtype(parquet::SchemaElement &sel) {
type = tmptype = REALSXP;
type_conversion = INT64_DOUBLE;
elsize = sizeof(double);
psize = 8;
if ((sel.__isset.logicalType &&
sel.logicalType.__isset.TIMESTAMP &&
(sel.logicalType.TIMESTAMP.unit.__isset.MILLIS ||
Expand Down Expand Up @@ -345,6 +348,7 @@ rtype::rtype(parquet::SchemaElement &sel) {
tmptype = INTSXP;
type_conversion = INT96_DOUBLE;
elsize = sizeof(int) * 3;
psize = 8 * 3;
rsize = 3;
classes.push_back("POSIXct");
classes.push_back("POSIXt");
Expand All @@ -354,10 +358,12 @@ rtype::rtype(parquet::SchemaElement &sel) {
type = tmptype = REALSXP;
type_conversion = FLOAT_DOUBLE;
elsize = sizeof(double);
psize = 4;
break;
case parquet::Type::DOUBLE:
type = tmptype = REALSXP;
elsize = sizeof(double);
psize = 8;
break;
case parquet::Type::BYTE_ARRAY:
case parquet::Type::FIXED_LEN_BYTE_ARRAY:
Expand Down Expand Up @@ -469,10 +475,10 @@ void RParquetReader::alloc_data_page(DataPage &data) {
} else if (!rt.byte_array) {
int64_t off = metadata.row_group_offsets[rg];
if (tmpdata[cl].size() > 0) {
data.data = tmpdata[cl].data() + (off + page_off) * rt.elsize;
// only for int96 currently
data.data = tmpdata[cl].data() + off * rt.elsize + page_off * rt.psize;
} else {
data.data = metadata.dataptr[cl] +
(off + page_off) * (rt.elsize / rt.rsize);
data.data = metadata.dataptr[cl] + off * rt.elsize + page_off * rt.psize;
}
} else {
tmpbytes bapage;
Expand Down
2 changes: 2 additions & 0 deletions src/RParquetReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ class rtype {
int tmptype = 0;
// size of tmptype or type in bytes
int elsize;
// size in parquet (with PLAIN encoding)
int psize;
// number of R tmptype elements for 1 Parquet element
int rsize = 1;
std::vector<std::string> classes;
Expand Down
Binary file added tests/testthat/data/float.parquet
Binary file not shown.
9 changes: 9 additions & 0 deletions tests/testthat/test-read-parquet-2.R
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,15 @@ test_that("FLOAT", {
})
})

# Regression test: a FLOAT column stored across multiple row groups and pages
# must be read back identically to what arrow returns for the same file.
test_that("FLOAT from multiple row groups and pages", {
  skip_on_cran()
  # arrow provides the reference result; skip (rather than error) when it is
  # not installed.
  skip_if_not_installed("arrow")
  pf <- test_path("data/float.parquet")
  expect_equal(
    as.data.frame(arrow::read_parquet(pf)),
    as.data.frame(read_parquet(pf))
  )
})

test_that("DOUBLE", {
tmp <- tempfile(fileext = ".parquet")
on.exit(unlink(tmp), add = TRUE)
Expand Down

0 comments on commit 8a5fb13

Please sign in to comment.