diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index eda5329..55b347b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,6 +35,7 @@ jobs: - run: cargo clippy --version - run: cargo clippy --features i64 - run: cargo clippy --all-targets --features i64 + - run: cargo clippy --all-targets --features i64,i128 - run: cargo clippy --all-targets --all-features test: @@ -44,12 +45,12 @@ jobs: - run: rustc --version - run: cargo test --features i64 - run: cargo test --features i128 - - run: cargo test --no-default-features --lib --features i64 - - run: cargo test --no-default-features --lib --features i128 - - run: cargo test --no-default-features --lib --features std,i64 - - run: cargo test --no-default-features --lib --features serde,i64 - - run: cargo test --no-default-features --lib --features i64,parity - - run: cargo test --no-default-features --lib --features i128,parity + - run: cargo test --no-default-features --lib --test it --features i64 + - run: cargo test --no-default-features --lib --test it --features i128 + - run: cargo test --no-default-features --lib --test it --features std,i64 + - run: cargo test --no-default-features --lib --test it --features serde,i64 + - run: cargo test --no-default-features --lib --test it --features i64,parity + - run: cargo test --no-default-features --lib --test it --features i128,parity - run: cargo test --all-features run-example: diff --git a/Cargo.toml b/Cargo.toml index 6ddeafb..345f31e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,7 +36,7 @@ std = ["derive_more/error"] i16 = [] i32 = [] i64 = [] -i128 = [] +i128 = ["dep:i256"] serde = ["dep:serde"] schemars = ["dep:schemars"] parity = ["parity-scale-codec"] @@ -46,16 +46,16 @@ quick-xml = ["serde?/derive", "serde?/alloc"] # FIXME: quick-xml#473 serde = { version = "1.0", default-features = false, optional = true } schemars = { version = "0.8", default-features = false, optional = true } typenum = "1.12.0" -derive_more = { version = "0.99.9", default-features = false } parity-scale-codec = { version = "3", default-features = false, optional = true } static_assertions = "1.1.0" itoa = "1.0.1" +i256 = { version = "=0.1.1", default-features = false, optional = true } [dev-dependencies] anyhow = { version = "1.0.38", default-features = false } colored = "2.0.0" criterion = "0.5" -derive_more = "0.99.9" +derive_more = { version = "1.0.0", features = ["full"] } trybuild = "1.0.85" serde_json = "1" proptest = "1.0.0" diff --git a/benches/README.md b/benches/README.md index 2a2b8a4..13ea43d 100644 --- a/benches/README.md +++ b/benches/README.md @@ -1,92 +1,92 @@ # Benchmarks -Benchmarks were performed on an [AMD Ryzen 7 4800HS CPU](https://en.wikichip.org/wiki/amd/ryzen_9/3900). +Benchmarks were performed on an Intel Core i9-14900K CPU. ```sh $ cargo bench --bench --features -$ critcmp new | tail +3 | sort | sed 's# ? ?/sec##' +$ critcmp new | tail +3 | sort | sed 's# ? 
?/sec##' | sed 's# 1.00##' ``` ## ops 64-bit FP with precision = 9: ``` -F64p9/cadd (~1e4) 1.00 1.9±0.01ns -F64p9/from_decimal(12345, -3) 1.00 1.6±0.00ns -F64p9/next_power_of_ten 1.00 3.6±0.01ns -F64p9/rdiv (~1e5/~1e4, Ceil) 1.00 1.9±0.01ns -F64p9/rdiv (~1e5/~1e4, Floor) 1.00 1.9±0.01ns -F64p9/rdiv (~1e5/~1e4, Nearest) 1.00 1.9±0.00ns -F64p9/rmul (~1e4, Ceil) 1.00 1.9±0.01ns -F64p9/rmul (~1e4, Floor) 1.00 1.9±0.03ns -F64p9/rmul (~1e4, Nearest) 1.00 1.9±0.00ns -F64p9/rsqrt (~1e4, Ceil) 1.00 43.7±0.29ns -F64p9/rsqrt (~1e4, Floor) 1.00 42.5±0.17ns -F64p9/rsqrt (~1e4, Nearest) 1.00 47.0±0.19ns -F64p9/rsqrt (adaptive, Ceil) 1.00 98.0±0.33ns -F64p9/rsqrt (adaptive, Floor) 1.00 94.4±1.45ns -F64p9/rsqrt (adaptive, Nearest) 1.00 99.6±0.67ns -F64p9/rsqrt (MAX, Ceil) 1.00 102.3±0.50ns -F64p9/rsqrt (MAX, Floor) 1.00 100.2±0.50ns -F64p9/rsqrt (MAX, Nearest) 1.00 102.7±0.80ns -F64p9/to_decimal(0) (12.345) 1.00 9.1±0.02ns -F64p9/to_decimal(i32::MAX) (12.345) 1.00 9.1±0.01ns -F64p9/try_from(f64) (~0.1) 1.00 64.8±0.33ns -F64p9/try_from(f64) (~1e-12) 1.00 132.5±0.46ns -F64p9/try_from(f64) (~1e6) 1.00 24.9±0.14ns -F64p9/try_from(f64) (MAX) 1.00 5.9±0.01µs -F64p9/try_from(f64) (MIN_POSITIVE) 1.00 1872.9±4.12ns +F64p9/cadd (~1e4) 1.0±0.03ns +F64p9/from_decimal(12345, -3) 1.0±0.01ns +F64p9/next_power_of_ten 1.6±0.03ns +F64p9/rdiv (~1e5/~1e4, Ceil) 1.0±0.03ns +F64p9/rdiv (~1e5/~1e4, Floor) 1.0±0.04ns +F64p9/rdiv (~1e5/~1e4, Nearest) 1.0±0.04ns +F64p9/rmul (~1e4, Ceil) 1.0±0.03ns +F64p9/rmul (~1e4, Floor) 1.0±0.04ns +F64p9/rmul (~1e4, Nearest) 1.0±0.05ns +F64p9/rsqrt (~1e4, Ceil) 1.0±0.02ns +F64p9/rsqrt (~1e4, Floor) 1.0±0.02ns +F64p9/rsqrt (~1e4, Nearest) 1.0±0.03ns +F64p9/rsqrt (adaptive, Ceil) 5.4±0.02ns +F64p9/rsqrt (adaptive, Floor) 4.9±0.01ns +F64p9/rsqrt (adaptive, Nearest) 5.5±0.02ns +F64p9/rsqrt (MAX, Ceil) 1.0±0.01ns +F64p9/rsqrt (MAX, Floor) 1.0±0.01ns +F64p9/rsqrt (MAX, Nearest) 1.0±0.01ns +F64p9/to_decimal(0) (12.345) 5.0±0.01ns +F64p9/to_decimal(i32::MAX) (12.345) 5.0±0.02ns +F64p9/try_from(f64) (~0.1) 33.2±0.08ns +F64p9/try_from(f64) (~1e-12) 61.9±0.20ns +F64p9/try_from(f64) (~1e6) 16.2±0.05ns +F64p9/try_from(f64) (MAX) 1263.8±2.26ns +F64p9/try_from(f64) (MIN_POSITIVE) 693.4±2.38ns ``` 128-bit FP with precision = 18: ``` -F128p18/cadd (~1e4) 1.00 2.8±0.00ns -F128p18/from_decimal(12345, -3) 1.00 9.1±0.03ns -F128p18/next_power_of_ten 1.00 6.3±0.03ns -F128p18/rdiv (~1e5/~1e4, Ceil) 1.00 157.3±0.51ns -F128p18/rdiv (~1e5/~1e4, Floor) 1.00 154.2±1.19ns -F128p18/rdiv (~1e5/~1e4, Nearest) 1.00 159.4±1.05ns -F128p18/rmul (~1e4, Ceil) 1.00 132.5±0.61ns -F128p18/rmul (~1e4, Floor) 1.00 132.3±0.79ns -F128p18/rmul (~1e4, Nearest) 1.00 134.1±0.79ns -F128p18/rsqrt (~1e4, Ceil) 1.00 428.3±7.08ns -F128p18/rsqrt (~1e4, Floor) 1.00 403.9±1.24ns -F128p18/rsqrt (~1e4, Nearest) 1.00 475.3±1.03ns -F128p18/rsqrt (adaptive, Ceil) 1.00 1469.3±3.05ns -F128p18/rsqrt (adaptive, Floor) 1.00 1436.2±1.98ns -F128p18/rsqrt (adaptive, Nearest) 1.00 1530.6±1.97ns -F128p18/rsqrt (MAX, Ceil) 1.00 1393.2±9.68ns -F128p18/rsqrt (MAX, Floor) 1.00 1335.9±10.01ns -F128p18/rsqrt (MAX, Nearest) 1.00 1441.7±11.63ns -F128p18/to_decimal(0) (12.345) 1.00 263.8±25.35ns -F128p18/to_decimal(i32::MAX) (12.345) 1.00 263.2±0.13ns -F128p18/try_from(f64) (~0.1) 1.00 59.3±0.36ns -F128p18/try_from(f64) (~1e-12) 1.00 133.0±0.14ns -F128p18/try_from(f64) (~1e6) 1.00 27.8±0.25ns -F128p18/try_from(f64) (MAX) 1.00 5.9±0.00µs -F128p18/try_from(f64) (MIN_POSITIVE) 1.00 1842.6±1.86ns +F128p18/cadd (~1e4) 1.9±0.05ns +F128p18/from_decimal(12345, -3) 4.8±0.02ns 
+F128p18/next_power_of_ten 3.1±0.04ns +F128p18/rdiv (~1e5/~1e4, Ceil) 10.7±0.15ns +F128p18/rdiv (~1e5/~1e4, Floor) 10.4±0.15ns +F128p18/rdiv (~1e5/~1e4, Nearest) 11.2±0.16ns +F128p18/rmul (~1e4, Ceil) 7.0±0.04ns +F128p18/rmul (~1e4, Floor) 7.0±0.02ns +F128p18/rmul (~1e4, Nearest) 7.2±0.06ns +F128p18/rsqrt (~1e4, Ceil) 40.0±0.24ns +F128p18/rsqrt (~1e4, Floor) 39.4±0.28ns +F128p18/rsqrt (~1e4, Nearest) 41.2±0.28ns +F128p18/rsqrt (adaptive, Ceil) 50.0±0.42ns +F128p18/rsqrt (adaptive, Floor) 49.2±0.42ns +F128p18/rsqrt (adaptive, Nearest) 50.6±0.38ns +F128p18/rsqrt (MAX, Ceil) 40.2±0.28ns +F128p18/rsqrt (MAX, Floor) 39.3±0.27ns +F128p18/rsqrt (MAX, Nearest) 41.4±0.38ns +F128p18/to_decimal(0) (12.345) 59.1±0.19ns +F128p18/to_decimal(i32::MAX) (12.345) 59.1±0.28ns +F128p18/try_from(f64) (~0.1) 28.5±1.51ns +F128p18/try_from(f64) (~1e-12) 62.1±0.20ns +F128p18/try_from(f64) (~1e6) 15.2±0.04ns +F128p18/try_from(f64) (MAX) 1264.6±4.34ns +F128p18/try_from(f64) (MIN_POSITIVE) 693.6±2.45ns ``` ## serde 64-bit FP with precision = 9: ``` -F64p9/deserialize 123.456 from f64 1.00 103.7±0.24ns -F64p9/deserialize 123.456 from string 1.00 54.8±0.18ns -F64p9/deserialize MAX from f64 1.00 59.8±0.24ns -F64p9/deserialize MAX from string 1.00 86.3±0.79ns -F64p9/serialize 123.456 to f64 1.00 48.2±0.46ns -F64p9/serialize 123.456 to string 1.00 27.5±0.29ns -F64p9/serialize MAX to f64 1.00 41.3±0.95ns -F64p9/serialize MAX to string 1.00 35.3±2.63ns +F64p9/deserialize 123.456 from f64 55.4±0.17ns +F64p9/deserialize 123.456 from string 27.1±0.34ns +F64p9/deserialize MAX from f64 44.4±0.03ns +F64p9/deserialize MAX from string 39.3±0.61ns +F64p9/serialize 123.456 to f64 27.0±0.33ns +F64p9/serialize 123.456 to string 13.1±0.21ns +F64p9/serialize MAX to f64 38.6±0.01ns +F64p9/serialize MAX to string 14.8±0.19ns ``` 128-bit FP with precision = 18: ``` -F128p18/deserialize 123.456 from f64 1.00 103.3±0.24ns -F128p18/deserialize 123.456 from string 1.00 70.8±0.09ns -F128p18/deserialize MAX from f64 1.00 56.6±0.19ns -F128p18/deserialize MAX from string 1.00 147.3±0.51ns -F128p18/serialize 123.456 to f64 1.00 67.7±0.38ns -F128p18/serialize 123.456 to string 1.00 51.7±0.64ns -F128p18/serialize MAX to f64 1.00 63.6±0.74ns -F128p18/serialize MAX to string 1.00 80.6±1.00ns +F128p18/deserialize 123.456 from f64 55.9±0.07ns +F128p18/deserialize 123.456 from string 31.5±0.74ns +F128p18/deserialize MAX from f64 40.8±0.20ns +F128p18/deserialize MAX from string 60.1±0.75ns +F128p18/serialize 123.456 to f64 30.4±0.15ns +F128p18/serialize 123.456 to string 23.6±0.29ns +F128p18/serialize MAX to f64 23.4±0.02ns +F128p18/serialize MAX to string 37.3±0.04ns ``` diff --git a/src/errors.rs b/src/errors.rs index 6d63cf4..ca487eb 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -1,10 +1,11 @@ use core::fmt::{Display, Formatter, Result}; +// TODO: once MSRV becomes 1.81, use `core::error::Error` instead. +// Also, enable doctests in CI checks even for no-std. #[cfg(feature = "std")] -use derive_more::Error; +use std::error::Error; /// Represents errors during arithmetic operations. -#[cfg_attr(feature = "std", derive(Error))] #[derive(Clone, Debug, PartialEq, Eq)] #[non_exhaustive] pub enum ArithmeticError { @@ -34,8 +35,10 @@ impl Display for ArithmeticError { } } +#[cfg(feature = "std")] +impl Error for ArithmeticError {} + /// Represents errors during conversions. 
-#[cfg_attr(feature = "std", derive(Error))] #[derive(Clone, Debug, PartialEq, Eq)] pub struct ConvertError { reason: &'static str, @@ -57,3 +60,6 @@ impl Display for ConvertError { f.write_str(self.as_str()) } } + +#[cfg(feature = "std")] +impl Error for ConvertError {} diff --git a/src/i256/mod.rs b/src/i256/mod.rs deleted file mode 100644 index cf55741..0000000 --- a/src/i256/mod.rs +++ /dev/null @@ -1,336 +0,0 @@ -use core::cmp::{Ordering, PartialOrd}; -use core::ops::{Add, Div, Mul, Neg, Sub}; - -use crate::ops::sqrt::Sqrt; -use crate::ops::{One, Zero}; -use crate::{ArithmeticError, ConvertError}; - -const TOTAL_BITS_COUNT: usize = 256; -const UINT_CHUNK_BITS_COUNT: usize = 64; -const UINT_CHUNKS_COUNT: usize = TOTAL_BITS_COUNT / UINT_CHUNK_BITS_COUNT; -const SIGN_MASK: u64 = 1 << (UINT_CHUNK_BITS_COUNT - 1); // MSB = 1, other are equal to 0. - -mod u256; - -use u256::U256; - -/// Signed 256-bit number. Works on top of U256 with help of two's complement. -#[derive(Copy, Clone, Debug, Eq, PartialEq)] -pub struct I256 { - inner: U256, -} - -impl I256 { - pub const I128_MAX: Self = Self::from_i128(i128::MAX); - pub const I128_MIN: Self = Self::from_i128(i128::MIN); - pub const U128_MAX: Self = Self::new(U256([u64::MAX, u64::MAX, 0, 0])); - pub const MAX: Self = Self::new(U256([u64::MAX, u64::MAX, u64::MAX, !SIGN_MASK])); - pub const MIN: Self = Self::new(U256([0, 0, 0, SIGN_MASK])); - - const fn new(x: U256) -> Self { - I256 { inner: x } - } - - pub const fn from_i128(x: i128) -> Self { - let msb = if x < 0 { u64::MAX } else { 0 }; - Self::new(U256([x as u64, (x >> 64) as u64, msb, msb])) // The only way to do it const - } - - const fn is_negative(self) -> bool { - let most_significant_chunk: u64 = self.chunks()[UINT_CHUNKS_COUNT - 1]; - most_significant_chunk & SIGN_MASK != 0 - } - - const fn chunks(&self) -> &[u64; UINT_CHUNKS_COUNT] { - &self.inner.0 - } -} - -impl Mul for I256 { - type Output = Self; - - #[inline] - fn mul(self, rhs: Self) -> Self::Output { - let lhs_was_negative = self.is_negative(); - let rhs_was_negative = rhs.is_negative(); - - let lhs = if lhs_was_negative { -self } else { self }; - let rhs = if rhs_was_negative { -rhs } else { rhs }; - - // Mustn't overflow because we're usually promoting just i128 to I256. - let result = Self::new(lhs.inner * rhs.inner); - if lhs_was_negative == rhs_was_negative { - result - } else { - -result - } - } -} - -impl Div for I256 { - type Output = Self; - - #[inline] - fn div(self, rhs: Self) -> Self::Output { - let lhs_was_negative = self.is_negative(); - let rhs_was_negative = rhs.is_negative(); - - let lhs = if lhs_was_negative { -self } else { self }; - let rhs = if rhs_was_negative { -rhs } else { rhs }; - - let result = Self::new(lhs.inner / rhs.inner); - if lhs_was_negative == rhs_was_negative { - result - } else { - -result - } - } -} - -impl Add for I256 { - type Output = Self; - - #[inline] - fn add(self, rhs: Self) -> Self::Output { - let (x, _) = self.inner.overflowing_add(rhs.inner); - Self::new(x) - } -} - -impl Sub for I256 { - type Output = Self; - - #[inline] - fn sub(self, rhs: Self) -> Self::Output { - let (x, _) = self.inner.overflowing_sub(rhs.inner); - Self::new(x) - } -} - -impl Neg for I256 { - type Output = Self; - - #[inline] - fn neg(self) -> Self::Output { - // Neg isn't defined for `I256::MIN` because on two's complement we always have one extra negative value. - debug_assert_ne!(self, Self::MIN); - // Overflow takes place when we negate zero. 
- let (x, _) = (!self.inner).overflowing_add(Self::ONE.inner); - Self::new(x) - } -} - -impl Ord for I256 { - #[inline] - fn cmp(&self, other: &Self) -> Ordering { - match (self.is_negative(), other.is_negative()) { - (true, false) => Ordering::Less, - (false, true) => Ordering::Greater, - _ => self.inner.cmp(&other.inner), - } - } -} - -impl PartialOrd for I256 { - #[inline] - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl From for I256 { - fn from(x: i128) -> Self { - Self::from_i128(x) - } -} - -impl TryFrom for i128 { - type Error = ArithmeticError; - - fn try_from(x: I256) -> Result { - if x > I256::I128_MAX || x < I256::I128_MIN { - return Err(ArithmeticError::Overflow); - } - Ok(i128::from(x.chunks()[0]) | (i128::from(x.chunks()[1]) << 64)) - } -} - -impl From for I256 { - fn from(x: u128) -> Self { - Self::new(x.into()) - } -} - -impl TryFrom for u128 { - type Error = ConvertError; - - fn try_from(x: I256) -> Result { - if x > I256::U128_MAX || x < I256::ZERO { - return Err(ConvertError::new("too big integer")); - } - Ok(u128::from(x.chunks()[0]) | (u128::from(x.chunks()[1]) << 64)) - } -} - -impl One for I256 { - const ONE: Self = Self::from_i128(1); -} - -impl Zero for I256 { - const ZERO: Self = Self::from_i128(0); -} - -impl Sqrt for I256 { - type Error = ArithmeticError; - - #[inline] - fn sqrt(self) -> Result { - debug_assert!(self >= Self::ZERO); - self.inner.sqrt().map(Self::new) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn min() { - assert_eq!(i128::try_from(I256::I128_MIN).unwrap(), i128::MIN); - } - - #[test] - fn max() { - assert_eq!(i128::try_from(I256::I128_MAX).unwrap(), i128::MAX); - } - - #[test] - fn cmp() { - use core::cmp::Ordering::{self, *}; - fn t(a: i128, b: i128, ord: Ordering) { - let a = I256::from(a); - let b = I256::from(b); - assert_eq!(a.cmp(&b), ord); - assert_eq!(b.cmp(&a), ord.reverse()); - } - t(5, 3, Greater); - t(-5, -5, Equal); - t(0, -5, Greater); - } - - #[test] - fn from_i128() { - fn t(x: i128) { - assert_eq!(i128::try_from(I256::from(x)).unwrap(), x); - } - t(0); - t(1); - t(-1); - t(i128::MAX); - t(i128::MAX - 1); - t(i128::MIN); - t(i128::MIN + 1); - } - - #[test] - fn neg_i128() { - fn t(x: i128) { - assert_eq!(i128::try_from(-I256::from(x)).unwrap(), -x); - assert_eq!(i128::try_from(-I256::from(-x)).unwrap(), x); - } - t(0); - t(1); - t(1234); - t(123_456_789_987); - } - - #[test] - fn neg_i256() { - fn t(value: I256, expected: I256) { - let actual: I256 = -value; - assert_eq!(actual, expected); - assert_eq!(-actual, value); - } - t(I256::MAX, I256::new(U256([1, 0, 0, SIGN_MASK]))); - t( - I256::new(U256([ - 0xa869_bc02_ecba_4436, - 0x5ef3_b3e7_5daa_96ce, - 0x369a_22b0_7ff5_955b, - 0x8aa9_fa9e_77c4_2900, - ])), - I256::new(U256([ - 0x579643fd1345bbca, - 0xa10c4c18a2556931, - 0xc965dd4f800a6aa4, - 0x75560561883bd6ff, - ])), - ); - } - - #[test] - #[should_panic] - fn neg_i256_min() { - let _x = -I256::MIN; - } - - #[test] - fn add() { - fn t(a: i128, b: i128, expected: i128) { - let a = I256::from(a); - let b = I256::from(b); - assert_eq!(i128::try_from(a + b).unwrap(), expected); - assert_eq!(i128::try_from(b + a).unwrap(), expected); - assert_eq!(i128::try_from((-a) + (-b)).unwrap(), -expected); - assert_eq!(i128::try_from((-b) + (-a)).unwrap(), -expected); - } - t(0, 0, 0); - t(1111, 3210, 4321); - t(-1111, 5432, 4321); - t(-4321, 5432, 1111); - } - - #[test] - fn sub() { - fn t(a: i128, b: i128, expected: i128) { - let a = I256::from(a); - let b = I256::from(b); - 
assert_eq!(i128::try_from(a - b).unwrap(), expected); - assert_eq!(i128::try_from(b - a).unwrap(), -expected); - assert_eq!(i128::try_from((-a) - (-b)).unwrap(), -expected); - assert_eq!(i128::try_from((-b) - (-a)).unwrap(), expected); - } - t(0, 0, 0); - t(4321, 1111, 3210); - t(4321, -1111, 5432); - t(1111, -4321, 5432); - } - - #[test] - fn mul() { - fn t(a: i128, b: i128, expected: i128) { - let a = I256::from(a); - let b = I256::from(b); - assert_eq!(i128::try_from(a * b).unwrap(), expected); - assert_eq!(i128::try_from(b * a).unwrap(), expected); - assert_eq!(i128::try_from((-a) * (-b)).unwrap(), expected); - assert_eq!(i128::try_from((-b) * (-a)).unwrap(), expected); - } - t(0, 0, 0); - t(7, 5, 35); - t(-7, 5, -35); - } - - #[test] - fn div() { - fn t(a: i128, b: i128, expected: i128) { - let a = I256::from(a); - let b = I256::from(b); - assert_eq!(i128::try_from(a / b).unwrap(), expected); - assert_eq!(i128::try_from((-a) / (-b)).unwrap(), expected); - } - t(0, 1, 0); - t(35, 5, 7); - t(-35, 5, -7); - } -} diff --git a/src/i256/u256.rs b/src/i256/u256.rs deleted file mode 100644 index 9b30cf5..0000000 --- a/src/i256/u256.rs +++ /dev/null @@ -1,869 +0,0 @@ -//! # `U256` -//! -//! Expanded unsigned 256-bit integer. -//! -//! Implementation courtesy of [`uint` crate](https://crates.io/crates/uint). - -use crate::errors::{ArithmeticError, ConvertError}; -use crate::ops::sqrt::Sqrt; -use crate::ops::Zero; - -macro_rules! impl_map_from { - ($thing:ident, $from:ty, $to:ty) => { - impl From<$from> for $thing { - fn from(value: $from) -> $thing { - From::from(value as $to) - } - } - }; -} - -macro_rules! uint_overflowing_binop { - ($name:ident, $n_words: tt, $self_expr: expr, $other: expr, $fn:expr) => {{ - let $name(ref me) = $self_expr; - let $name(ref you) = $other; - - let mut ret = [0u64; $n_words]; - let ret_ptr = &mut ret as *mut [u64; $n_words] as *mut u64; - let mut carry = 0u64; - - uint! { @unroll - for i in 0..$n_words { - if carry != 0 { - let (res1, overflow1) = ($fn)(me[i], you[i]); - let (res2, overflow2) = ($fn)(res1, carry); - - unsafe { - // SAFETY: `i` is within bounds and `i * size_of::() < isize::MAX` - #![allow(clippy::ptr_offset_with_cast)] - *ret_ptr.offset(i as _) = res2 - } - carry = (overflow1 as u8 + overflow2 as u8) as u64; - } else { - let (res, overflow) = ($fn)(me[i], you[i]); - - unsafe { - // SAFETY: `i` is within bounds and `i * size_of::() < isize::MAX` - #![allow(clippy::ptr_offset_with_cast)] - *ret_ptr.offset(i as _) = res - } - - carry = overflow as u64; - } - } - } - - ($name(ret), carry > 0) - }}; -} - -macro_rules! uint_full_mul_reg { - ($name:ident, 8, $self_expr:expr, $other:expr) => { - $crate::uint_full_mul_reg!($name, 8, $self_expr, $other, |a, b| a != 0 || b != 0); - }; - ($name:ident, $n_words:tt, $self_expr:expr, $other:expr) => { - uint_full_mul_reg!($name, $n_words, $self_expr, $other, |_, _| true) - }; - ($name:ident, $n_words:tt, $self_expr:expr, $other:expr, $check:expr) => {{ - { - #![allow(unused_assignments)] - - let $name(ref me) = $self_expr; - let $name(ref you) = $other; - let mut ret = [0u64; $n_words * 2]; - - uint! { @unroll - for i in 0..$n_words { - let mut carry = 0u64; - let b = you[i]; - - uint! 
{ @unroll - for j in 0..$n_words { - #[allow(clippy::redundant_closure_call)] - if $check(me[j], carry) { - let a = me[j]; - - let (hi, low) = Self::split_u128(a as u128 * b as u128); - - let overflow = { - let existing_low = &mut ret[i + j]; - let (low, o) = low.overflowing_add(*existing_low); - *existing_low = low; - o - }; - - carry = { - let existing_hi = &mut ret[i + j + 1]; - let hi = hi + overflow as u64; - let (hi, o0) = hi.overflowing_add(carry); - let (hi, o1) = hi.overflowing_add(*existing_hi); - *existing_hi = hi; - - (o0 | o1) as u64 - } - } - } - } - } - } - - ret - } - }}; -} - -macro_rules! uint_overflowing_mul { - ($name:ident, $n_words: tt, $self_expr: expr, $other: expr) => {{ - let ret: [u64; $n_words * 2] = uint_full_mul_reg!($name, $n_words, $self_expr, $other); - - // The safety of this is enforced by the compiler - let ret: [[u64; $n_words]; 2] = unsafe { core::mem::transmute(ret) }; - - // The compiler WILL NOT inline this if you remove this annotation. - #[inline(always)] - fn any_nonzero(arr: &[u64; $n_words]) -> bool { - uint! { @unroll - for i in 0..$n_words { - if arr[i] != 0 { - return true; - } - } - } - - false - } - - ($name(ret[0]), any_nonzero(&ret[1])) - }}; -} - -fn panic_on_overflow(flag: bool) { - if flag { - panic!("arithmetic operation overflow") - } -} - -macro_rules! impl_mul_from { - ($name: ty, $other: ident) => { - impl core::ops::Mul<$other> for $name { - type Output = $name; - - fn mul(self, other: $other) -> $name { - let bignum: $name = other.into(); - let (result, overflow) = self.overflowing_mul(bignum); - panic_on_overflow(overflow); - result - } - } - - impl<'a> core::ops::Mul<&'a $other> for $name { - type Output = $name; - - fn mul(self, other: &'a $other) -> $name { - let bignum: $name = (*other).into(); - let (result, overflow) = self.overflowing_mul(bignum); - panic_on_overflow(overflow); - result - } - } - - impl<'a> core::ops::Mul<&'a $other> for &'a $name { - type Output = $name; - - fn mul(self, other: &'a $other) -> $name { - let bignum: $name = (*other).into(); - let (result, overflow) = self.overflowing_mul(bignum); - panic_on_overflow(overflow); - result - } - } - - impl<'a> core::ops::Mul<$other> for &'a $name { - type Output = $name; - - fn mul(self, other: $other) -> $name { - let bignum: $name = other.into(); - let (result, overflow) = self.overflowing_mul(bignum); - panic_on_overflow(overflow); - result - } - } - - impl core::ops::MulAssign<$other> for $name { - fn mul_assign(&mut self, other: $other) { - let result = *self * other; - *self = result - } - } - }; -} - -macro_rules! 
impl_mul_for_primitive { - ($name: ty, $other: ident) => { - impl core::ops::Mul<$other> for $name { - type Output = $name; - - fn mul(self, other: $other) -> $name { - let (result, carry) = self.overflowing_mul_u64(other as u64); - panic_on_overflow(carry > 0); - result - } - } - - impl<'a> core::ops::Mul<&'a $other> for $name { - type Output = $name; - - fn mul(self, other: &'a $other) -> $name { - let (result, carry) = self.overflowing_mul_u64(*other as u64); - panic_on_overflow(carry > 0); - result - } - } - - impl<'a> core::ops::Mul<&'a $other> for &'a $name { - type Output = $name; - - fn mul(self, other: &'a $other) -> $name { - let (result, carry) = self.overflowing_mul_u64(*other as u64); - panic_on_overflow(carry > 0); - result - } - } - - impl<'a> core::ops::Mul<$other> for &'a $name { - type Output = $name; - - fn mul(self, other: $other) -> $name { - let (result, carry) = self.overflowing_mul_u64(other as u64); - panic_on_overflow(carry > 0); - result - } - } - - impl core::ops::MulAssign<$other> for $name { - fn mul_assign(&mut self, other: $other) { - let result = *self * (other as u64); - *self = result - } - } - }; -} - -macro_rules! uint { - ( $(#[$attr:meta])* $visibility:vis struct $name:ident (1); ) => { - uint!{ @construct $(#[$attr])* $visibility struct $name (1); } - }; - - ( $(#[$attr:meta])* $visibility:vis struct $name:ident ( $n_words:tt ); ) => { - uint! { @construct $(#[$attr])* $visibility struct $name ($n_words); } - }; - ( @construct $(#[$attr:meta])* $visibility:vis struct $name:ident ( $n_words:tt ); ) => { - /// Little-endian large integer type - #[repr(C)] - $(#[$attr])* - #[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)] - $visibility struct $name (pub(crate) [u64; $n_words]); - - /// Get a reference to the underlying little-endian words. - impl AsRef<[u64]> for $name { - #[inline] - fn as_ref(&self) -> &[u64] { - &self.0 - } - } - - impl $name { - const WORD_BITS: usize = 64; - - /// Low word (u64) - #[inline] - const fn low_u64(&self) -> u64 { - let &$name(ref arr) = self; - arr[0] - } - - /// Conversion to usize with overflow checking - /// - /// # Panics - /// - /// Panics if the number is larger than usize::max_value(). - #[inline] - fn as_usize(&self) -> usize { - let &$name(ref arr) = self; - if !self.fits_word() || arr[0] > usize::max_value() as u64 { - panic!("Integer overflow when casting to usize") - } - arr[0] as usize - } - - // Whether this fits u64. - #[inline] - fn fits_word(&self) -> bool { - let &$name(ref arr) = self; - for i in 1..$n_words { if arr[i] != 0 { return false; } } - return true; - } - - /// Return the least number of bits needed to represent the number - #[inline] - fn bits(&self) -> usize { - let &$name(ref arr) = self; - for i in 1..$n_words { - if arr[$n_words - i] > 0 { return (0x40 * ($n_words - i + 1)) - arr[$n_words - i].leading_zeros() as usize; } - } - 0x40 - arr[0].leading_zeros() as usize - } - - /// Zero (additive identity) of this type. 
- #[inline] - const fn zero() -> Self { - Self([0; $n_words]) - } - - fn full_shl(self, shift: u32) -> [u64; $n_words + 1] { - debug_assert!(shift < Self::WORD_BITS as u32); - let mut u = [0u64; $n_words + 1]; - let u_lo = self.0[0] << shift; - let u_hi = self >> (Self::WORD_BITS as u32 - shift); - u[0] = u_lo; - u[1..].copy_from_slice(&u_hi.0[..]); - u - } - - fn full_shr(u: [u64; $n_words + 1], shift: u32) -> Self { - debug_assert!(shift < Self::WORD_BITS as u32); - let mut res = Self::zero(); - for i in 0..$n_words { - res.0[i] = u[i] >> shift; - } - // carry - if shift > 0 { - for i in 1..=$n_words { - res.0[i - 1] |= u[i] << (Self::WORD_BITS as u32 - shift); - } - } - res - } - - fn full_mul_u64(self, by: u64) -> [u64; $n_words + 1] { - let (prod, carry) = self.overflowing_mul_u64(by); - let mut res = [0u64; $n_words + 1]; - res[..$n_words].copy_from_slice(&prod.0[..]); - res[$n_words] = carry; - res - } - - fn div_mod_small(mut self, other: u64) -> (Self, Self) { - let mut rem = 0u64; - self.0.iter_mut().rev().for_each(|d| { - let (q, r) = Self::div_mod_word(rem, *d, other); - *d = q; - rem = r; - }); - (self, rem.into()) - } - - // See Knuth, TAOCP, Volume 2, section 4.3.1, Algorithm D. - fn div_mod_knuth(self, mut v: Self, n: usize, m: usize) -> (Self, Self) { - debug_assert!(self.bits() >= v.bits() && !v.fits_word()); - debug_assert!(n + m <= $n_words); - // D1. - // Make sure 64th bit in v's highest word is set. - // If we shift both self and v, it won't affect the quotient - // and the remainder will only need to be shifted back. - let shift = v.0[n - 1].leading_zeros(); - v <<= shift; - // u will store the remainder (shifted) - let mut u = self.full_shl(shift); - - // quotient - let mut q = Self::zero(); - let v_n_1 = v.0[n - 1]; - let v_n_2 = v.0[n - 2]; - - // D2. D7. - // iterate from m downto 0 - for j in (0..=m).rev() { - let u_jn = u[j + n]; - - // D3. - // q_hat is our guess for the j-th quotient digit - // q_hat = min(b - 1, (u_{j+n} * b + u_{j+n-1}) / v_{n-1}) - // b = 1 << WORD_BITS - // Theorem B: q_hat >= q_j >= q_hat - 2 - let mut q_hat = if u_jn < v_n_1 { - let (mut q_hat, mut r_hat) = Self::div_mod_word(u_jn, u[j + n - 1], v_n_1); - // this loop takes at most 2 iterations - loop { - // check if q_hat * v_{n-2} > b * r_hat + u_{j+n-2} - let (hi, lo) = Self::split_u128(u128::from(q_hat) * u128::from(v_n_2)); - if (hi, lo) <= (r_hat, u[j + n - 2]) { - break; - } - // then iterate till it doesn't hold - q_hat -= 1; - let (new_r_hat, overflow) = r_hat.overflowing_add(v_n_1); - r_hat = new_r_hat; - // if r_hat overflowed, we're done - if overflow { - break; - } - } - q_hat - } else { - // here q_hat >= q_j >= q_hat - 1 - u64::max_value() - }; - - // ex. 20: - // since q_hat * v_{n-2} <= b * r_hat + u_{j+n-2}, - // either q_hat == q_j, or q_hat == q_j + 1 - - // D4. - // let's assume optimistically q_hat == q_j - // subtract (q_hat * v) from u[j..] - let q_hat_v = v.full_mul_u64(q_hat); - // u[j..] -= q_hat_v; - let c = Self::sub_slice(&mut u[j..], &q_hat_v[..n + 1]); - - // D6. - // actually, q_hat == q_j + 1 and u[j..] has overflowed - // highly unlikely ~ (1 / 2^63) - if c { - q_hat -= 1; - // add v to u[j..] - let c = Self::add_slice(&mut u[j..], &v.0[..n]); - u[j + n] = u[j + n].wrapping_add(u64::from(c)); - } - - // D5. - q.0[j] = q_hat; - } - - // D8. 
- let remainder = Self::full_shr(u, shift); - - (q, remainder) - } - - // Returns the least number of words needed to represent the nonzero number - fn words(bits: usize) -> usize { - debug_assert!(bits > 0); - 1 + (bits - 1) / Self::WORD_BITS - } - - /// Returns a pair `(self / other, self % other)`. - /// - /// # Panics - /// - /// Panics if `other` is zero. - fn div_mod(self, other: Self) -> (Self, Self) { - let my_bits = self.bits(); - let your_bits = other.bits(); - - assert!(your_bits != 0, "division by zero"); - - // Early return in case we are dividing by a larger number than us - if my_bits < your_bits { - return (Self::zero(), self); - } - - if your_bits <= Self::WORD_BITS { - return self.div_mod_small(other.low_u64()); - } - - let (n, m) = { - let my_words = Self::words(my_bits); - let your_words = Self::words(your_bits); - (your_words, my_words - your_words) - }; - - self.div_mod_knuth(other, n, m) - } - - /// Add with overflow. - #[inline(always)] - pub(crate) fn overflowing_add(self, other: $name) -> ($name, bool) { - uint_overflowing_binop!( - $name, - $n_words, - self, - other, - u64::overflowing_add - ) - } - - /// Subtraction which underflows and returns a flag if it does. - #[inline(always)] - pub(crate) fn overflowing_sub(self, other: $name) -> ($name, bool) { - uint_overflowing_binop!( - $name, - $n_words, - self, - other, - u64::overflowing_sub - ) - } - - /// Multiply with overflow, returning a flag if it does. - #[inline(always)] - pub(crate) fn overflowing_mul(self, other: $name) -> ($name, bool) { - uint_overflowing_mul!($name, $n_words, self, other) - } - - #[inline(always)] - fn div_mod_word(hi: u64, lo: u64, y: u64) -> (u64, u64) { - debug_assert!(hi < y); - // NOTE: this is slow (__udivti3) - // let x = (u128::from(hi) << 64) + u128::from(lo); - // let d = u128::from(d); - // ((x / d) as u64, (x % d) as u64) - // TODO: look at https://gmplib.org/~tege/division-paper.pdf - const TWO32: u64 = 1 << 32; - let s = y.leading_zeros(); - let y = y << s; - let (yn1, yn0) = Self::split(y); - let un32 = (hi << s) | lo.checked_shr(64 - s).unwrap_or(0); - let un10 = lo << s; - let (un1, un0) = Self::split(un10); - let mut q1 = un32 / yn1; - let mut rhat = un32 - q1 * yn1; - - while q1 >= TWO32 || q1 * yn0 > TWO32 * rhat + un1 { - q1 -= 1; - rhat += yn1; - if rhat >= TWO32 { - break; - } - } - - let un21 = un32.wrapping_mul(TWO32).wrapping_add(un1).wrapping_sub(q1.wrapping_mul(y)); - let mut q0 = un21 / yn1; - rhat = un21.wrapping_sub(q0.wrapping_mul(yn1)); - - while q0 >= TWO32 || q0 * yn0 > TWO32 * rhat + un0 { - q0 -= 1; - rhat += yn1; - if rhat >= TWO32 { - break; - } - } - - let rem = un21.wrapping_mul(TWO32).wrapping_add(un0).wrapping_sub(y.wrapping_mul(q0)); - (q1 * TWO32 + q0, rem >> s) - } - - #[inline(always)] - fn add_slice(a: &mut [u64], b: &[u64]) -> bool { - Self::binop_slice(a, b, u64::overflowing_add) - } - - #[inline(always)] - fn sub_slice(a: &mut [u64], b: &[u64]) -> bool { - Self::binop_slice(a, b, u64::overflowing_sub) - } - - #[inline(always)] - fn binop_slice(a: &mut [u64], b: &[u64], binop: impl Fn(u64, u64) -> (u64, bool) + Copy) -> bool { - let mut c = false; - a.iter_mut().zip(b.iter()).for_each(|(x, y)| { - let (res, carry) = Self::binop_carry(*x, *y, c, binop); - *x = res; - c = carry; - }); - c - } - - #[inline(always)] - fn binop_carry(a: u64, b: u64, c: bool, binop: impl Fn(u64, u64) -> (u64, bool)) -> (u64, bool) { - let (res1, overflow1) = b.overflowing_add(u64::from(c)); - let (res2, overflow2) = binop(a, res1); - (res2, overflow1 || 
overflow2) - } - - #[inline(always)] - const fn mul_u64(a: u64, b: u64, carry: u64) -> (u64, u64) { - let (hi, lo) = Self::split_u128(a as u128 * b as u128 + carry as u128); - (lo, hi) - } - - #[inline(always)] - const fn split(a: u64) -> (u64, u64) { - (a >> 32, a & 0xFFFF_FFFF) - } - - #[inline(always)] - const fn split_u128(a: u128) -> (u64, u64) { - ((a >> 64) as _, (a & 0xFFFFFFFFFFFFFFFF) as _) - } - - /// Overflowing multiplication by u64. - /// Returns the result and carry. - fn overflowing_mul_u64(mut self, other: u64) -> (Self, u64) { - let mut carry = 0u64; - - for d in self.0.iter_mut() { - let (res, c) = Self::mul_u64(*d, other, carry); - *d = res; - carry = c; - } - - (self, carry) - } - - fn leading_zeros(&self) -> u32 { - self.0.iter().rev().fold((0, false), |(acc, one_was_met), &chunk| { - if one_was_met { - (acc, true) - } else { - (acc + chunk.leading_zeros(), chunk != 0) - } - }).0 - } - } - - impl core::convert::From for $name { - fn from(value: u64) -> $name { - let mut ret = [0; $n_words]; - ret[0] = value; - $name(ret) - } - } - - impl core::convert::TryFrom<$name> for u128 { - type Error = ConvertError; - - fn try_from(value: $name) -> Result { - if $n_words * $name::WORD_BITS as u32 - value.leading_zeros() > 128 { - return Err(ConvertError::new("too big integer")); - } - let ret = (value.0[0] as u128) | ((value.0[1] as u128) << $name::WORD_BITS as u32); - Ok(ret) - } - } - - impl core::convert::From for $name { - fn from(value: u128) -> Self { - let mut ret = [0u64; $n_words]; - ret[0] = value as _ ; - ret[1] = (value >> 64) as _; - $name(ret) - } - } - - impl_map_from!($name, u32, u64); - - impl core::convert::From for $name { - fn from(value: i64) -> $name { - match value >= 0 { - true => From::from(value as u64), - false => { panic!("Unsigned integer can't be created from negative value"); } - } - } - } - - // all other impls - impl_mul_from!($name, $name); - impl_mul_for_primitive!($name, u64); - impl_mul_for_primitive!($name, usize); - - impl core::ops::Div for $name where T: Into<$name> { - type Output = $name; - - fn div(self, other: T) -> $name { - let other: Self = other.into(); - self.div_mod(other).0 - } - } - - impl<'a, T> core::ops::Div for &'a $name where T: Into<$name> { - type Output = $name; - - fn div(self, other: T) -> $name { - *self / other - } - } - - impl core::ops::DivAssign for $name where T: Into<$name> { - fn div_assign(&mut self, other: T) { - *self = *self / other.into(); - } - } - - impl core::ops::Not for $name { - type Output = $name; - - #[inline] - fn not(self) -> $name { - let $name(ref arr) = self; - let mut ret = [0u64; $n_words]; - for i in 0..$n_words { - ret[i] = !arr[i]; - } - $name(ret) - } - } - - impl core::ops::Shl for $name where T: Into<$name> { - type Output = $name; - - fn shl(self, shift: T) -> $name { - let shift = shift.into().as_usize(); - let $name(ref original) = self; - let mut ret = [0u64; $n_words]; - let word_shift = shift / 64; - let bit_shift = shift % 64; - - // shift - for i in word_shift..$n_words { - ret[i] = original[i - word_shift] << bit_shift; - } - // carry - if bit_shift > 0 { - for i in word_shift+1..$n_words { - ret[i] += original[i - 1 - word_shift] >> (64 - bit_shift); - } - } - $name(ret) - } - } - - impl<'a, T> core::ops::Shl for &'a $name where T: Into<$name> { - type Output = $name; - fn shl(self, shift: T) -> $name { - *self << shift - } - } - - impl core::ops::ShlAssign for $name where T: Into<$name> { - fn shl_assign(&mut self, shift: T) { - *self = *self << shift; - } - } - - impl 
core::ops::Shr for $name where T: Into<$name> { - type Output = $name; - - fn shr(self, shift: T) -> $name { - let shift = shift.into().as_usize(); - let $name(ref original) = self; - let mut ret = [0u64; $n_words]; - let word_shift = shift / 64; - let bit_shift = shift % 64; - - // shift - for i in word_shift..$n_words { - ret[i - word_shift] = original[i] >> bit_shift; - } - - // Carry - if bit_shift > 0 { - for i in word_shift+1..$n_words { - ret[i - word_shift - 1] += original[i] << (64 - bit_shift); - } - } - - $name(ret) - } - } - - impl<'a, T> core::ops::Shr for &'a $name where T: Into<$name> { - type Output = $name; - fn shr(self, shift: T) -> $name { - *self >> shift - } - } - - impl core::ops::ShrAssign for $name where T: Into<$name> { - fn shr_assign(&mut self, shift: T) { - *self = *self >> shift; - } - } - - impl core::cmp::Ord for $name { - fn cmp(&self, other: &$name) -> core::cmp::Ordering { - self.as_ref().iter().rev().cmp(other.as_ref().iter().rev()) - } - } - - impl core::cmp::PartialOrd for $name { - fn partial_cmp(&self, other: &$name) -> Option { - Some(self.cmp(other)) - } - } - - impl Zero for $name { - const ZERO: Self = Self([0; $n_words]); - } - - impl Sqrt for $name { - type Error = ArithmeticError; - - #[inline] - fn sqrt(self) -> Result { - #[inline] - fn least_significant_word_or(mut a: $name, b: u64) -> $name { - a.0[0] |= b; - a - } - - let result = match u128::try_from(self) { - Ok(x) => x.sqrt()?.into(), - Err(_) => { - let lo = (self >> 2u32).sqrt()? << 1u32; - let hi = least_significant_word_or(lo, 1); - let (hi_square, _): (U256, _) = hi.overflowing_mul(hi); - if hi_square <= self { - hi - } else { - lo - } - } - }; - Ok(result) - } - } - }; - - (@unroll for $v:ident in $start:tt..$end:tt {$($c:tt)*}) => { - #[allow(non_upper_case_globals)] - #[allow(unused_comparisons)] - { - uint!(@unroll @$v, 0, $end, { - if $v >= $start {$($c)*} - } - ); - } - }; - - (@unroll @$v:ident, $a:expr, 4, $c:block) => { - { const $v: usize = $a; $c } - { const $v: usize = $a + 1; $c } - { const $v: usize = $a + 2; $c } - { const $v: usize = $a + 3; $c } - }; -} - -uint! { - pub(crate) struct U256(4); -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn leading_zeros() { - fn t(x: U256, expected: u32) { - assert_eq!(x.leading_zeros(), expected); - } - t(U256::ZERO, 256); - t(1u128.into(), 255); - t(2u128.into(), 254); - t((1u128 << 127).into(), 128); - t(u128::MAX.into(), 128); - t((1u128 << 117).into(), 138); - t((u128::MAX >> 10).into(), 138); - t((u128::MAX >> 10).into(), 138); - } -} diff --git a/src/i256_polyfill.rs b/src/i256_polyfill.rs new file mode 100644 index 0000000..ce9419c --- /dev/null +++ b/src/i256_polyfill.rs @@ -0,0 +1,364 @@ +use core::cmp::{Ordering, PartialOrd}; +use core::ops::{Add, Div, Mul, Neg, Shl, Shr, Sub}; + +use ::i256::i256 as i256_; + +use crate::{ + layout::Promotion, + ops::{One, Zero}, + ConvertError, +}; + +/// A polyfill for i256. 
+#[allow(non_camel_case_types)] +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[repr(transparent)] +pub(crate) struct i256(pub i256_); + +static_assertions::assert_eq_size!(i256, [u128; 2]); + +impl i256 { + const I128_MAX: Self = Self::from_i128(i128::MAX); + const I128_MIN: Self = Self::from_i128(i128::MIN); + const I64_MAX: Self = Self::from_i64(i64::MAX); + const I64_MIN: Self = Self::from_i64(i64::MIN); + const MIN: Self = Self(i256_::MIN); + + pub(crate) const fn from_i128(x: i128) -> Self { + Self(i256_::from_i128(x)) + } + + const fn from_i64(x: i64) -> Self { + Self(i256_::from_i64(x)) + } + + const fn from_i8(x: i8) -> Self { + Self(i256_::from_i8(x)) + } + + #[cfg(test)] + const fn new(lo: u128, hi: i128) -> Self { + Self(i256_::new(lo, hi)) + } +} + +impl Promotion for i256 { + type Layout = i128; + + #[inline] + fn as_layout(&self) -> Self::Layout { + self.0.as_i128() + } + + #[cfg(feature = "std")] + #[inline] + fn as_positive_f64(&self) -> f64 { + debug_assert!(*self >= Self::ZERO); + let hi = self.0.high() as f64; + let lo = self.0.low() as f64; + let b2p128 = 3.402823669209385e38; + hi * b2p128 + lo + } + + #[inline] + fn leading_zeros(&self) -> u32 { + self.0.leading_zeros() + } + + #[inline] + fn mul_l(&self, rhs: Self::Layout) -> Self { + Self(self.0.mul_iwide(rhs)) + } + + #[inline] + fn div_l(&self, rhs: Self::Layout) -> Self { + Self(self.0.div_iwide(rhs)) + } + + #[inline] + fn div_rem_l(&self, rhs: Self::Layout) -> (Self, Self::Layout) { + let (div, rem) = self.0.div_rem_iwide(rhs); + (Self(div), rem) + } +} + +impl One for i256 { + const ONE: Self = Self::from_i64(1); +} + +impl Zero for i256 { + const ZERO: Self = Self::from_i64(0); +} + +impl Mul for i256 { + type Output = Self; + + #[inline] + fn mul(self, rhs: Self) -> Self::Output { + Self(self.0 * rhs.0) + } +} + +impl Div for i256 { + type Output = Self; + + #[inline] + fn div(self, rhs: Self) -> Self::Output { + Self(self.0 / rhs.0) + } +} + +impl Add for i256 { + type Output = Self; + + #[inline] + fn add(self, rhs: Self) -> Self::Output { + Self(self.0 + rhs.0) + } +} + +impl Sub for i256 { + type Output = Self; + + #[inline] + fn sub(self, rhs: Self) -> Self::Output { + Self(self.0 - rhs.0) + } +} + +impl Neg for i256 { + type Output = Self; + + #[inline] + fn neg(self) -> Self::Output { + debug_assert_ne!(self, Self::MIN); + Self(-self.0) + } +} + +impl Ord for i256 { + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + self.0.cmp(&other.0) + } +} + +impl PartialOrd for i256 { + #[inline] + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl From for i256 { + #[inline] + fn from(x: i8) -> Self { + Self::from_i8(x) + } +} + +impl From for i256 { + #[inline] + fn from(x: i64) -> Self { + Self::from_i64(x) + } +} + +impl From for i256 { + #[inline] + fn from(x: i128) -> Self { + Self::from_i128(x) + } +} + +impl TryFrom for i128 { + type Error = ConvertError; + + #[inline] + fn try_from(x: i256) -> Result { + if !(i256::I128_MIN..=i256::I128_MAX).contains(&x) { + return Err(ConvertError::new("not in range")); + } + + Ok(x.0.as_i128()) + } +} + +impl TryFrom for i64 { + type Error = ConvertError; + + #[inline] + fn try_from(x: i256) -> Result { + if !(i256::I64_MIN..=i256::I64_MAX).contains(&x) { + return Err(ConvertError::new("not in range")); + } + + Ok(x.0.as_i64()) + } +} + +impl Shl for i256 { + type Output = Self; + + #[inline] + fn shl(self, rhs: u32) -> Self::Output { + Self(self.0 << rhs) + } +} + +impl Shr for i256 { + type Output = Self; + + #[inline] + 
fn shr(self, rhs: u32) -> Self::Output { + Self(self.0 >> rhs) + } +} + +// Simple smoke tests to check that the underlying implementation is adequate. +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn bounds_i128() { + assert_eq!(i128::try_from(i256::I128_MIN).unwrap(), i128::MIN); + assert_eq!(i128::try_from(i256::I128_MAX).unwrap(), i128::MAX); + } + + #[test] + fn cmp() { + use core::cmp::Ordering::{self, *}; + fn t(a: i128, b: i128, ord: Ordering) { + let a = i256::from(a); + let b = i256::from(b); + assert_eq!(a.cmp(&b), ord); + assert_eq!(b.cmp(&a), ord.reverse()); + } + t(5, 3, Greater); + t(-5, -5, Equal); + t(0, -5, Greater); + } + + #[test] + fn from_i128() { + fn t(x: i128) { + assert_eq!(i128::try_from(i256::from(x)).unwrap(), x); + } + t(0); + t(1); + t(-1); + t(i128::MAX); + t(i128::MAX - 1); + t(i128::MIN); + t(i128::MIN + 1); + } + + #[test] + fn neg_i128() { + fn t(x: i128) { + assert_eq!(i128::try_from(-i256::from(x)).unwrap(), -x); + assert_eq!(i128::try_from(-i256::from(-x)).unwrap(), x); + } + t(0); + t(1); + t(1234); + t(123_456_789_987); + } + + #[test] + fn neg_i256() { + fn t(value: i256, expected: i256) { + let actual: i256 = -value; + assert_eq!(actual, expected); + assert_eq!(-actual, value); + } + t(i256::new(u128::MAX, i128::MAX), i256::new(1, i128::MIN)); + t( + i256::new(u128::MAX / 2, i128::MAX / 2), + i256::new(u128::MAX / 2 + 2, i128::MIN / 2), + ); + } + + #[test] + #[should_panic] + fn neg_i256_min() { + let _x = -i256::MIN; + } + + #[test] + fn add() { + fn t(a: i128, b: i128, expected: i128) { + let a = i256::from(a); + let b = i256::from(b); + assert_eq!(i128::try_from(a + b).unwrap(), expected); + assert_eq!(i128::try_from(b + a).unwrap(), expected); + assert_eq!(i128::try_from((-a) + (-b)).unwrap(), -expected); + assert_eq!(i128::try_from((-b) + (-a)).unwrap(), -expected); + } + t(0, 0, 0); + t(1111, 3210, 4321); + t(-1111, 5432, 4321); + t(-4321, 5432, 1111); + } + + #[test] + fn sub() { + fn t(a: i128, b: i128, expected: i128) { + let a = i256::from(a); + let b = i256::from(b); + assert_eq!(i128::try_from(a - b).unwrap(), expected); + assert_eq!(i128::try_from(b - a).unwrap(), -expected); + assert_eq!(i128::try_from((-a) - (-b)).unwrap(), -expected); + assert_eq!(i128::try_from((-b) - (-a)).unwrap(), expected); + } + t(0, 0, 0); + t(4321, 1111, 3210); + t(4321, -1111, 5432); + t(1111, -4321, 5432); + } + + #[test] + fn mul() { + fn t(a: i128, b: i128, expected: i128) { + let a = i256::from(a); + let b = i256::from(b); + assert_eq!(i128::try_from(a * b).unwrap(), expected); + assert_eq!(i128::try_from(b * a).unwrap(), expected); + assert_eq!(i128::try_from((-a) * (-b)).unwrap(), expected); + assert_eq!(i128::try_from((-b) * (-a)).unwrap(), expected); + } + t(0, 0, 0); + t(7, 5, 35); + t(-7, 5, -35); + } + + #[test] + fn div() { + fn t(a: i128, b: i128, expected: i128) { + let a = i256::from(a); + let b = i256::from(b); + assert_eq!(i128::try_from(a / b).unwrap(), expected); + assert_eq!(i128::try_from((-a) / (-b)).unwrap(), expected); + } + t(0, 1, 0); + t(35, 5, 7); + t(-35, 5, -7); + } + + #[cfg(feature = "std")] + #[test] + fn as_positive_f64() { + fn t(x: i256, expected: f64) { + assert_eq!(x.as_positive_f64(), expected); + } + t(0i64.into(), 0.0); + t(1i64.into(), 1.0); + t(i64::MAX.into(), 9.223372036854776e18); + t(i128::MAX.into(), 1.7014118346046923e38); + t( + i256::from(i128::MAX) * i256::from(i128::MAX), + 2.894802230932905e76, + ); + } +} diff --git a/src/layout.rs b/src/layout.rs new file mode 100644 index 0000000..8dc8eda 
--- /dev/null +++ b/src/layout.rs @@ -0,0 +1,65 @@ +use core::convert::{From, TryInto}; +use core::ops::{Add, Div, Mul, Neg, Sub}; + +#[doc(hidden)] +pub trait Promotion: + Sized + Ord + Neg + Add + Sub + Mul + Div + From + TryInto +{ + type Layout; + + fn as_layout(&self) -> Self::Layout; + #[cfg(feature = "std")] + fn as_positive_f64(&self) -> f64; + fn leading_zeros(&self) -> u32; + fn mul_l(&self, rhs: Self::Layout) -> Self; + fn div_l(&self, rhs: Self::Layout) -> Self; + fn div_rem_l(&self, rhs: Self::Layout) -> (Self, Self::Layout); +} + +#[cfg(any(feature = "i16", feature = "i32", feature = "i64"))] +macro_rules! promotion { + ($layout:ty => $prom:ty) => { + impl Promotion for $prom { + type Layout = $layout; + + #[inline] + fn as_layout(&self) -> Self::Layout { + *self as $layout + } + + #[cfg(feature = "std")] + #[inline] + fn as_positive_f64(&self) -> f64 { + *self as f64 + } + + #[inline] + fn leading_zeros(&self) -> u32 { + (*self).leading_zeros() + } + + #[inline] + fn mul_l(&self, rhs: Self::Layout) -> Self { + self * rhs as $prom + } + + #[inline] + fn div_l(&self, rhs: Self::Layout) -> Self { + self / rhs as $prom + } + + #[inline] + fn div_rem_l(&self, rhs: Self::Layout) -> (Self, Self::Layout) { + (self / rhs as $prom, (self % rhs as $prom) as Self::Layout) + } + } + }; +} + +#[cfg(feature = "i16")] +promotion!(i16 => i32); +#[cfg(feature = "i32")] +promotion!(i32 => i64); +#[cfg(feature = "i64")] +promotion!(i64 => i128); +// NOTE: i128 => i256 is implemented in the `i256_polyfill` module. diff --git a/src/lib.rs b/src/lib.rs index 5351f2c..ca48ec4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,7 +7,7 @@ //! ## Features //! Turn them on in `Cargo.toml`: //! -//! - `i128` — `i128` layout support which will be promoted to internally implemented `I256` for +//! - `i128` — `i128` layout support which will be promoted to a polyfill for `i256` for //! multiplication and division. //! - `i64` — `i64` layout support which will be promoted to `i128` for multiplication and division. //! - `i32` — `i32` layout support which will be promoted to `i64` for multiplication and division. @@ -134,13 +134,12 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -use core::cmp::Ord; -use core::{fmt, i64, marker::PhantomData}; +use core::{cmp::Ord, fmt, marker::PhantomData}; use typenum::Unsigned; #[cfg(feature = "i128")] -use crate::i256::I256; +use crate::i256_polyfill::i256; use crate::ops::{sqrt::Sqrt, *}; use crate::string::Stringify; @@ -148,7 +147,8 @@ mod const_fn; mod errors; mod float; #[cfg(feature = "i128")] -mod i256; +mod i256_polyfill; +mod layout; mod macros; #[cfg(feature = "parity")] mod parity; @@ -172,6 +172,7 @@ mod schemars; #[doc(hidden)] pub mod _priv { pub use crate::const_fn::*; + pub use crate::layout::*; pub use crate::macros::Operand; pub use crate::ops::*; } @@ -235,9 +236,10 @@ macro_rules! impl_fixed_point { $(#[$attr:meta])? inner = $layout:tt; promoted_to = $promotion:tt; - convert = $convert:expr; try_from = [$($try_from:ty),*]; - ) => { + ) => {const _: () = { + use $crate::_priv::Promotion as _; + $(#[$attr])? impl FixedPoint<$layout, P> { /// The number of digits in the fractional part. @@ -247,7 +249,6 @@ macro_rules! impl_fixed_point { const COEF: $layout = const_fn::pow10(Self::PRECISION) as _; const NEG_COEF: $layout = -Self::COEF; - const COEF_PROMOTED: $promotion = $convert(Self::COEF) as _; } $(#[$attr])? @@ -273,19 +274,13 @@ macro_rules! 
impl_fixed_point { #[inline] fn rmul(self, rhs: Self, mode: RoundMode) -> Result { - // TODO: avoid 128bit arithmetic when possible, - // because LLVM doesn't replace 128bit division by const with multiplication. - - let value = $promotion::from(self.inner) * $promotion::from(rhs.inner); - // TODO: replace with multiplication by a constant. - let result = value / Self::COEF_PROMOTED; - let loss = value - result * Self::COEF_PROMOTED; + let value = $promotion::from(self.inner).mul_l(rhs.inner); + // `|loss| < COEF`, thus it fits in the layout. + let (result, loss) = value.div_rem_l(Self::COEF); let mut result = $layout::try_from(result).map_err(|_| ArithmeticError::Overflow)?; - // `|loss| < COEF`, thus it fits in the layout. - let loss = $layout::try_from(loss).unwrap(); let sign = self.inner.signum() * rhs.inner.signum(); let add_signed_one = if mode == RoundMode::Nearest { @@ -310,24 +305,17 @@ macro_rules! impl_fixed_point { #[inline] fn rdiv(self, rhs: Self, mode: RoundMode) -> Result { - // TODO: avoid 128bit arithmetic when possible, - // because LLVM doesn't replace 128bit division by const with multiplication. - if rhs.inner == 0 { return Err(ArithmeticError::DivisionByZero); } - let numerator = $promotion::from(self.inner) * Self::COEF_PROMOTED; - let denominator = $promotion::from(rhs.inner); - let result = numerator / denominator; - let loss = numerator - result * denominator; + let numerator = $promotion::from(self.inner).mul_l(Self::COEF); + // `|loss| < rhs`, thus it fits in the layout. + let (result, loss) = numerator.div_rem_l(rhs.inner); let mut result = $layout::try_from(result).map_err(|_| ArithmeticError::Overflow)?; - // `|loss| < denominator`, thus it fits in the layout. - let loss = $layout::try_from(loss).unwrap(); - if loss != 0 { let sign = self.inner.signum() * rhs.inner.signum(); @@ -587,8 +575,6 @@ macro_rules! impl_fixed_point { /// * `Ceil`: `S ≥ sqrt(F)` /// * `Nearest`: `Floor` or `Ceil`, which one is closer to `sqrt(F)` /// - /// The fastest mode is `Floor`. - /// /// ``` /// # #[cfg(feature = "i64")] /// # fn main() -> Result<(), Box> { @@ -617,23 +603,31 @@ macro_rules! impl_fixed_point { // At first we have `S_inner = S * COEF`. // We'd like to gain `sqrt(S) * COEF`: // `sqrt(S) * COEF = sqrt(S * COEF^2) = sqrt(S_inner * COEF)` - let squared = $promotion::from(self.inner) * Self::COEF_PROMOTED; - let lo = squared.sqrt()?; + let squared = $promotion::from(self.inner).mul_l(Self::COEF); + let lo = squared.sqrt(); let add_one = match mode { RoundMode::Floor => false, RoundMode::Nearest => { - let lo2 = lo * lo; - // (lo+1)^2 = lo^2 +2lo + 1 - let hi2 = lo2 + lo + lo + $promotion::ONE; - squared - lo2 >= hi2 - squared + // We choose to round up iff + // + // (lo+1)^2 - squared <= squared - lo^2 + // + // However, we don't want to do calculations in the promoted type, + // because it can be slow (`i128` and `i256`). So, we use modular + // arithmetic (with `2^bits(layout)` modulus) to avoid it. + + let lo2 = lo.wrapping_mul(lo); + // hi^2 = (lo+1)^2 = lo^2 + 2lo + 1 + let hi2 = lo2.wrapping_add(lo).wrapping_add(lo).wrapping_add($layout::ONE); + let squared = squared.as_layout(); + hi2.wrapping_sub(squared) <= squared.wrapping_sub(lo2) + }, + RoundMode::Ceil => { + lo.wrapping_mul(lo) != squared.as_layout() }, - RoundMode::Ceil if lo * lo == squared => false, - RoundMode::Ceil => true, }; - // `sqrt` can't take more bits than `self` already does, thus `unwrap()` is ok. 
- let lo = $layout::try_from(lo).unwrap(); let inner = if add_one { lo + $layout::ONE } else { @@ -734,12 +728,7 @@ macro_rules! impl_fixed_point { } } )* - }; -} - -#[cfg(any(feature = "i64", feature = "i32", feature = "i16"))] -const fn identity(x: T) -> T { - x + };}; } #[cfg(feature = "i16")] @@ -747,7 +736,6 @@ impl_fixed_point!( #[cfg_attr(docsrs, doc(cfg(feature = "i16")))] inner = i16; promoted_to = i32; - convert = identity; try_from = [i8, u8, i16, u16, i32, u32, i64, u64, i128, u128, isize, usize]; ); #[cfg(feature = "i32")] @@ -755,7 +743,6 @@ impl_fixed_point!( #[cfg_attr(docsrs, doc(cfg(feature = "i32")))] inner = i32; promoted_to = i64; - convert = identity; try_from = [i8, u8, i16, u16, i32, u32, i64, u64, i128, u128, isize, usize]; ); #[cfg(feature = "i64")] @@ -763,14 +750,12 @@ impl_fixed_point!( #[cfg_attr(docsrs, doc(cfg(feature = "i64")))] inner = i64; promoted_to = i128; - convert = identity; try_from = [i8, u8, i16, u16, i32, u32, i64, u64, i128, u128, isize, usize]; ); #[cfg(feature = "i128")] impl_fixed_point!( #[cfg_attr(docsrs, doc(cfg(feature = "i128")))] inner = i128; - promoted_to = I256; - convert = I256::from_i128; + promoted_to = i256; try_from = [i8, u8, i16, u16, i32, u32, i64, u64, i128, u128, isize, usize]; ); diff --git a/src/ops/sqrt.rs b/src/ops/sqrt.rs index 5a2b295..efe4a66 100644 --- a/src/ops/sqrt.rs +++ b/src/ops/sqrt.rs @@ -1,56 +1,40 @@ -use core::mem; +use crate::{layout::Promotion, ops::Zero}; -use crate::ArithmeticError; - -pub(crate) trait Sqrt: Sized { - type Error; - - /// Checked square root. - /// For given non-negative number S returns max possible number Q such that: - /// `Q ≤ sqrt(S)`. - /// Returns `Error` for negative arguments. - fn sqrt(self) -> Result; +pub(crate) trait Sqrt: Promotion { + fn sqrt(self) -> Self::Layout; } macro_rules! impl_sqrt { - ($( $int:ty ),+ $(,)?) => { - $( impl_sqrt!(@single $int); )* - }; - (@single $int:ty) => { - impl Sqrt for $int { - type Error = ArithmeticError; - + ($prom:ty) => { + impl Sqrt for $prom { /// Checked integer square root. /// Sqrt implementation courtesy of [`num` crate][num]. /// /// [num]: https://github.com/rust-num/num-integer/blob/4d166cbb754244760e28ea4ce826d54fafd3e629/src/roots.rs#L278 #[inline] - fn sqrt(self) -> Result { - #[inline] - const fn bits() -> u32 { - (mem::size_of::() * 8) as _ - } + fn sqrt(self) -> Self::Layout { + type Layout = <$prom as Promotion>::Layout; #[cfg(feature = "std")] #[inline] - fn guess(x: $int) -> $int { - (x as f64).sqrt() as $int + fn guess(v: $prom) -> Layout { + v.as_positive_f64().sqrt() as Layout } #[cfg(not(feature = "std"))] #[inline] - fn guess(x: $int) -> $int { + fn guess(v: $prom) -> Layout { #[inline] - fn log2_estimate(x: $int) -> u32 { - debug_assert!(x > 0); - bits::<$int>() - 1 - x.leading_zeros() + fn log2_estimate(v: $prom) -> u32 { + debug_assert!(v > <$prom as Zero>::ZERO); + (core::mem::size_of::<$prom>() as u32 * 8) - 1 - v.leading_zeros() } - 1 << ((log2_estimate(x) + 1) / 2) + 1 << ((log2_estimate(v) + 1) / 2) } #[inline] - fn fixpoint(mut x: $int, f: impl Fn($int) -> $int) -> $int { + fn fixpoint(mut x: Layout, f: impl Fn(Layout) -> Layout) -> Layout { let mut xn = f(x); while x < xn { x = xn; @@ -63,30 +47,25 @@ macro_rules! impl_sqrt { x } - #[allow(unused_comparisons)] - { debug_assert!(self >= 0); } + debug_assert!(self >= <$prom as Zero>::ZERO); - if bits::<$int>() > 64 { - // 128-bit division is slow, so do a recursive bitwise `sqrt` until it's small enough. 
- let result = match u64::try_from(self) { - Ok(x) => x.sqrt()? as _, - Err(_) => { - let lo = (self >> 2u32).sqrt()? << 1; - let hi = lo + 1; - if hi * hi <= self { hi } else { lo } - } - }; - return Ok(result); - } - if self < 4 { - return Ok((self > 0).into()); + if self < <$prom>::from(4i8) { + return ((self > <$prom as Zero>::ZERO) as i8).into(); } + // https://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Babylonian_method - let next = |x: $int| (self / x + x) >> 1; - Ok(fixpoint(guess(self), next)) + let next = |x: Layout| (self.div_l(x).as_layout() + x) >> 1; + fixpoint(guess(self), next) } } - } + }; } -impl_sqrt!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); +#[cfg(feature = "i16")] +impl_sqrt!(i32); +#[cfg(feature = "i32")] +impl_sqrt!(i64); +#[cfg(feature = "i64")] +impl_sqrt!(i128); +#[cfg(feature = "i128")] +impl_sqrt!(crate::i256); diff --git a/src/string.rs b/src/string.rs index 6e59efc..26cda7b 100644 --- a/src/string.rs +++ b/src/string.rs @@ -56,7 +56,7 @@ macro_rules! impl_for { let prec = Self::PRECISION as usize; // TODO: negative precision? if EXACT { - if fractional_str.len() > Self::PRECISION.abs() as usize { + if fractional_str.len() > Self::PRECISION.unsigned_abs() as usize { return Err(ConvertError::new("requested precision is too high")); } } diff --git a/tests/it/const_ctor/too_long_fractional.stderr b/tests/it/const_ctor/too_long_fractional.stderr index 50912d3..aa71d74 100644 --- a/tests/it/const_ctor/too_long_fractional.stderr +++ b/tests/it/const_ctor/too_long_fractional.stderr @@ -14,7 +14,7 @@ help: the constant being evaluated = note: `#[deny(long_running_const_eval)]` on by default = note: this error originates in the macro `const_assert` which comes from the expansion of the macro `fixnum_const` (in Nightly builds, run with -Z macro-backtrace for more info) -note: erroneous constant used +note: erroneous constant encountered --> tests/it/const_ctor/too_long_fractional.rs:7:36 | 7 | const VALUE: FixedPoint = fixnum_const!(0.1234567891, 9);
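
For reference, a minimal standalone sketch (assumed names, not the crate's API) of the reworked `rmul` flow this patch introduces: one promoted division/remainder pair replaces the old "divide, multiply back, subtract" loss computation. It is specialized here to the `i64` layout promoted to `i128`; `COEF` and the nearest tie-breaking rule are illustrative assumptions.

```rust
// Standalone illustration (hypothetical names, not the crate's API).
fn rmul_nearest(lhs: i64, rhs: i64) -> Option<i64> {
    // Assumes PRECISION = 9, i.e. the `F64p9` configuration from the benches.
    const COEF: i128 = 1_000_000_000;

    let value = i128::from(lhs) * i128::from(rhs);
    // Single division/remainder pair in the promoted type; the remainder is
    // the rounding loss and always fits back into the layout (|loss| < COEF).
    let (quot, loss) = (value / COEF, value % COEF);
    // Overflow check when narrowing the quotient back to the layout.
    let mut result = i64::try_from(quot).ok()?;

    // Nearest rounding driven by `loss`; the crate's exact tie-breaking may
    // differ — this only shows how the remainder feeds the adjustment.
    if 2 * loss.abs() >= COEF {
        result = result.checked_add(if value < 0 { -1 } else { 1 })?;
    }
    Some(result)
}

fn main() {
    // 1.5 * 2.5 = 3.75 at precision 9.
    assert_eq!(
        rmul_nearest(1_500_000_000, 2_500_000_000),
        Some(3_750_000_000)
    );
    // 1e-9 * 1e-9 = 1e-18 rounds to 0 at precision 9.
    assert_eq!(rmul_nearest(1, 1), Some(0));
}
```

The design point the patch leans on is visible above: computing quotient and remainder together avoids the second multiplication, the subtraction, and the extra `try_from` of the loss that the previous implementation needed in the promoted type.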