Skip to content

Commit

Permalink
perf(rust, python): optimize string kernels, (elide redundant allocs) (
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Mar 17, 2023
1 parent 5836aca commit 9b66ab3
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 58 deletions.
31 changes: 31 additions & 0 deletions polars/polars-ops/src/chunked_array/strings/case.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
use polars_core::prelude::Utf8Chunked;

/// Returns a new `Utf8Chunked` in which every string of `ca` is lowercased.
///
/// A single scratch buffer is reused for all values to avoid one allocation per
/// string. ASCII-only values are lowercased in place inside that buffer. Values
/// containing non-ASCII characters are converted with [`str::to_lowercase`], because
/// `make_ascii_lowercase` leaves non-ASCII letters (e.g. `Ä`) unchanged.
pub(super) fn to_lowercase<'a>(ca: &'a Utf8Chunked) -> Utf8Chunked {
    // amortize allocation
    let mut buf = String::new();
    let f = |s: &'a str| {
        if s.is_ascii() {
            // Fast path: copy into the reused buffer and flip the case in place.
            buf.clear();
            buf.push_str(s);
            buf.make_ascii_lowercase();
        } else {
            // Unicode path: ASCII-only conversion would silently skip these letters.
            buf = s.to_lowercase();
        }
        let slice = buf.as_str();
        // SAFETY: `slice` points into `buf`, which is overwritten on the next call of
        // this closure. Extending its lifetime to `'a` is only valid if `apply_mut`
        // consumes each returned slice before invoking the closure again and never
        // retains it. NOTE(review): `apply_mut` is defined elsewhere; confirm it
        // upholds this.
        unsafe { std::mem::transmute::<&str, &'a str>(slice) }
    };
    ca.apply_mut(f)
}

/// Returns a new `Utf8Chunked` in which every string of `ca` is uppercased.
///
/// A single scratch buffer is reused for all values to avoid one allocation per
/// string. ASCII-only values are uppercased in place inside that buffer. Values
/// containing non-ASCII characters are converted with [`str::to_uppercase`], because
/// `make_ascii_uppercase` leaves non-ASCII letters (e.g. `ä`) unchanged.
pub(super) fn to_uppercase<'a>(ca: &'a Utf8Chunked) -> Utf8Chunked {
    // amortize allocation
    let mut buf = String::new();
    let f = |s: &'a str| {
        if s.is_ascii() {
            // Fast path: copy into the reused buffer and flip the case in place.
            buf.clear();
            buf.push_str(s);
            buf.make_ascii_uppercase();
        } else {
            // Unicode path: ASCII-only conversion would silently skip these letters.
            buf = s.to_uppercase();
        }
        let slice = buf.as_str();
        // SAFETY: `slice` points into `buf`, which is overwritten on the next call of
        // this closure. Extending its lifetime to `'a` is only valid if `apply_mut`
        // consumes each returned slice before invoking the closure again and never
        // retains it. NOTE(review): `apply_mut` is defined elsewhere; confirm it
        // upholds this.
        unsafe { std::mem::transmute::<&str, &'a str>(slice) }
    };
    ca.apply_mut(f)
}
83 changes: 83 additions & 0 deletions polars/polars-ops/src/chunked_array/strings/justify.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
use std::fmt::Write;

use polars_core::prelude::Utf8Chunked;

/// Left-justifies every string of `ca` by appending `fillchar` on the right.
///
/// The number of fill characters is `width` minus the byte length of the value
/// (`str::len`), saturating at zero. Values that need no padding are passed
/// through unchanged without touching the scratch buffer.
pub(super) fn ljust<'a>(ca: &'a Utf8Chunked, width: usize, fillchar: char) -> Utf8Chunked {
    // One scratch buffer shared by all values so padding does not allocate per string.
    let mut scratch = String::new();
    let pad_right = |value: &'a str| {
        let missing = width.saturating_sub(value.len());
        if missing == 0 {
            return value;
        }
        scratch.clear();
        scratch.push_str(value);
        scratch.extend(std::iter::repeat(fillchar).take(missing));
        let padded: &str = scratch.as_str();
        // SAFETY: `padded` borrows from `scratch`, which is rewritten on the next
        // call. Extending the lifetime to `'a` is only valid if `apply_mut` consumes
        // each returned slice before calling the closure again.
        // NOTE(review): `apply_mut` is defined elsewhere; confirm it upholds this.
        unsafe { std::mem::transmute::<&str, &'a str>(padded) }
    };
    ca.apply_mut(pad_right)
}

/// Right-justifies every string of `ca` by prepending `fillchar` on the left.
///
/// The number of fill characters is `width` minus the byte length of the value
/// (`str::len`), saturating at zero. Values that need no padding are passed
/// through unchanged without touching the scratch buffer.
pub(super) fn rjust<'a>(ca: &'a Utf8Chunked, width: usize, fillchar: char) -> Utf8Chunked {
    // One scratch buffer shared by all values so padding does not allocate per string.
    let mut scratch = String::new();
    let pad_left = |value: &'a str| {
        let missing = width.saturating_sub(value.len());
        if missing == 0 {
            return value;
        }
        scratch.clear();
        scratch.extend(std::iter::repeat(fillchar).take(missing));
        scratch.push_str(value);
        let padded: &str = scratch.as_str();
        // SAFETY: `padded` borrows from `scratch`, which is rewritten on the next
        // call. Extending the lifetime to `'a` is only valid if `apply_mut` consumes
        // each returned slice before calling the closure again.
        // NOTE(review): `apply_mut` is defined elsewhere; confirm it upholds this.
        unsafe { std::mem::transmute::<&str, &'a str>(padded) }
    };
    ca.apply_mut(pad_left)
}

/// Left-pads every string of `ca` with `'0'` up to `alignment` bytes.
///
/// The number of zeros is `alignment` minus the byte length of the value
/// (`str::len`), saturating at zero. If the value starts with `'-'`, the zeros
/// are inserted after the sign. Values that need no padding are passed through
/// unchanged.
pub(super) fn zfill<'a>(ca: &'a Utf8Chunked, alignment: usize) -> Utf8Chunked {
    // One scratch buffer shared by all values so padding does not allocate per string.
    let mut scratch = String::new();
    let pad_zeros = |value: &'a str| {
        let zeros = alignment.saturating_sub(value.len());
        if zeros == 0 {
            return value;
        }
        // Split off a leading minus sign so the zeros land between sign and digits.
        let (sign, digits) = match value.strip_prefix('-') {
            Some(rest) => ("-", rest),
            None => ("", value),
        };
        scratch.clear();
        // Formatting the integer `0` with zero-fill width `zeros` emits exactly
        // `zeros` zero characters.
        write!(&mut scratch, "{}{:0zeros$}{}", sign, 0, digits, zeros = zeros).unwrap();
        let padded: &str = scratch.as_str();
        // SAFETY: `padded` borrows from `scratch`, which is rewritten on the next
        // call. Extending the lifetime to `'a` is only valid if `apply_mut` consumes
        // each returned slice before calling the closure again.
        // NOTE(review): `apply_mut` is defined elsewhere; confirm it upholds this.
        unsafe { std::mem::transmute::<&str, &'a str>(padded) }
    };
    ca.apply_mut(pad_zeros)
}
4 changes: 4 additions & 0 deletions polars/polars-ops/src/chunked_array/strings/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
#[cfg(feature = "strings")]
mod case;
#[cfg(feature = "extract_jsonpath")]
mod json_path;
#[cfg(feature = "string_justify")]
mod justify;
#[cfg(feature = "strings")]
mod namespace;
#[cfg(feature = "strings")]
Expand Down
66 changes: 8 additions & 58 deletions polars/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,77 +109,27 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
/// rather than before.
/// The original string is returned if width is less than or equal to `s.len()`.
#[cfg(feature = "string_justify")]
fn zfill<'a>(&'a self, alignment: usize) -> Utf8Chunked {
fn zfill(&self, alignment: usize) -> Utf8Chunked {
let ca = self.as_utf8();

let f = |s: &'a str| {
let alignment = alignment.saturating_sub(s.len());
if alignment == 0 {
return Cow::Borrowed(s);
}
if let Some(stripped) = s.strip_prefix('-') {
Cow::Owned(format!(
"-{:0alignment$}{value}",
0,
alignment = alignment,
value = stripped
))
} else {
Cow::Owned(format!(
"{:0alignment$}{value}",
0,
alignment = alignment,
value = s
))
}
};
ca.apply(f)
justify::zfill(ca, alignment)
}

/// Return the string left justified in a string of length width.
/// Padding is done using the specified `fillchar`,
/// The original string is returned if width is less than or equal to `s.len()`.
#[cfg(feature = "string_justify")]
fn ljust<'a>(&'a self, width: usize, fillchar: char) -> Utf8Chunked {
fn ljust(&self, width: usize, fillchar: char) -> Utf8Chunked {
let ca = self.as_utf8();

let f = |s: &'a str| {
let padding = width.saturating_sub(s.len());
if padding == 0 {
Cow::Borrowed(s)
} else {
let mut buf = String::with_capacity(width);
buf.push_str(s);
for _ in 0..padding {
buf.push(fillchar)
}
Cow::Owned(buf)
}
};
ca.apply(f)
justify::ljust(ca, width, fillchar)
}

/// Return the string right justified in a string of length width.
/// Padding is done using the specified `fillchar`,
/// The original string is returned if width is less than or equal to `s.len()`.
#[cfg(feature = "string_justify")]
fn rjust<'a>(&'a self, width: usize, fillchar: char) -> Utf8Chunked {
fn rjust(&self, width: usize, fillchar: char) -> Utf8Chunked {
let ca = self.as_utf8();

let f = |s: &'a str| {
let padding = width.saturating_sub(s.len());
if padding == 0 {
Cow::Borrowed(s)
} else {
let mut buf = String::with_capacity(width);
for _ in 0..padding {
buf.push(fillchar)
}
buf.push_str(s);
Cow::Owned(buf)
}
};
ca.apply(f)
justify::rjust(ca, width, fillchar)
}

/// Check if strings contain a regex pattern.
Expand Down Expand Up @@ -414,14 +364,14 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
#[must_use]
fn to_lowercase(&self) -> Utf8Chunked {
let ca = self.as_utf8();
ca.apply(|s| str::to_lowercase(s).into())
case::to_lowercase(ca)
}

/// Modify the strings to their uppercase equivalent
#[must_use]
fn to_uppercase(&self) -> Utf8Chunked {
let ca = self.as_utf8();
ca.apply(|s| str::to_uppercase(s).into())
case::to_uppercase(ca)
}

/// Concat with the values from a second Utf8Chunked
Expand Down

0 comments on commit 9b66ab3

Please sign in to comment.