diff --git a/docs/changelog.md b/docs/changelog.md
index ee3c8740..24bd1da8 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -8,7 +8,10 @@ Release notes for `quimb`.
 
 **Enhancements:**
 
 - replace all `numba` based parallelism (`prange` and parallel vectorize) with explicit thread pool based parallelism. This should be more reliable, with no need to set `NUMBA_NUM_THREADS` anymore. The env var `QUIMB_NUMBA_PAR` has been removed.
+- [`Circuit`](quimb.tensor.circuit.Circuit): add `dtype` and `convert_eager` options. `dtype` specifies the data type the computation should be performed in. `convert_eager` specifies whether to apply this (and any `to_backend` calls) as soon as gates are applied (the default for MPS circuit simulation) or just prior to contraction (the default for exact contraction simulation).
+- [`tn.full_simplify`](quimb.tensor.tensor_core.TensorNetwork.full_simplify): add a `check_zero` option (set to `"auto"` by default) which explicitly checks for zero tensor norms when equalizing norms, to avoid `log10(norm)` resulting in -inf or nan. Since this creates a data dependency that breaks e.g. `jax` tracing, it is optional.
 - [`schematic.Drawing`](quimb.schematic.Drawing): add `shorten` kwarg to [line drawing](quimb.schematic.Drawing.line) and [curve drawing](quimb.schematic.Drawing.curve) and examples to {ref}`schematic`.
+- [`TensorNetwork`](quimb.tensor.tensor_core.TensorNetwork): add `.backend` and `.dtype_name` properties.
 
 (whats-new-1-9-0)=
diff --git a/quimb/tensor/circuit.py b/quimb/tensor/circuit.py
index e8a988c4..3c5da595 100644
--- a/quimb/tensor/circuit.py
+++ b/quimb/tensor/circuit.py
@@ -15,7 +15,7 @@ import warnings
 
 import numpy as np
-from autoray import backend_like, do, reshape
+from autoray import astype, backend_like, do, get_dtype_name, reshape
 
 import quimb as qu
@@ -41,6 +41,7 @@ from .tensor_core import (
     PTensor,
     Tensor,
+    TensorNetwork,
     get_tags,
     oset_union,
     rand_uuid,
@@ -1522,6 +1523,21 @@ class Circuit:
     bra_site_ind_id : str, optional
         Use this to label 'bra' site indices when creating certain (mostly
         internal) intermediate tensor networks.
+    dtype : str, optional
+        A default dtype to perform calculations in. Depending on
+        `convert_eager`, this is enforced either *after* circuit construction
+        and simplification (the default for exact simulation), or eagerly, on
+        the initial state and on gates as they are applied (the default for
+        MPS simulation).
+    to_backend : callable, optional
+        If given, apply this function to both the initial state arrays and to
+        every gate as it is applied.
+    convert_eager : bool, optional
+        Whether to eagerly perform dtype casting and application of
+        `to_backend` as gates are supplied, or wait until after the necessary
+        TNs for a particular task such as sampling are formed and simplified.
+        Deferred conversion (`convert_eager=False`) is the default mode for
+        full contraction.
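To make the deferred-conversion mode concrete, a minimal sketch (the gate sequence is arbitrary; `_psi` is the internal state TN referred to elsewhere in this patch):

```python
import quimb.tensor as qtn

# deferred conversion (the exact-simulation default): the stored state
# keeps its construction dtype while gates are applied ...
circ = qtn.Circuit(2, dtype="complex64", convert_eager=False)
circ.h(0)
circ.cx(0, 1)
print(circ._psi.dtype_name)  # 'complex128'

# ... and the cast to complex64 only happens after the amplitude TN
# has been formed and simplified, just prior to contraction
print(circ.amplitude("00"))
```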
 Attributes
 ----------
@@ -1595,7 +1611,9 @@ def __init__(
         round_tag_id="ROUND_{}",
         tag_gate_labels=True,
         bra_site_ind_id="b{}",
+        dtype=None,
         to_backend=None,
+        convert_eager=False,
     ):
         if (N is None) and (psi0 is None):
             raise ValueError("You must supply one of `N` or `psi0`.")
@@ -1626,12 +1644,12 @@
         self.tag_gate_rounds = tag_gate_rounds
         self.tag_gate_labels = tag_gate_labels
 
+        self.dtype = dtype
         self.to_backend = to_backend
-        if self.to_backend is not None:
-            self._psi.apply_to_arrays(self.to_backend)
-            self._backend_gate_cache = {}
-        else:
-            self._backend_gate_cache = None
+        self.convert_eager = convert_eager
+        if self.convert_eager:
+            self._maybe_convert(self._psi)
+        self._backend_gate_cache = {}
 
         self.gate_opts = ensure_dict(gate_opts)
         self.gate_opts.setdefault("contract", gate_contract)
@@ -1665,6 +1683,8 @@ def copy(self):
         new.tag_gate_rounds = self.tag_gate_rounds
         new.tag_gate_labels = self.tag_gate_labels
         new.to_backend = self.to_backend
+        new.dtype = self.dtype
+        new.convert_eager = self.convert_eager
         new._backend_gate_cache = self._backend_gate_cache
         new._gates = self._gates.copy()
         new._ket_site_ind_id = self._ket_site_ind_id
@@ -1676,6 +1696,31 @@
         new._sampled_conditionals = self._sampled_conditionals.copy()
         return new
 
+    def _maybe_convert(self, obj, dtype=None):
+        istn = isinstance(obj, TensorNetwork)
+
+        if dtype is None:
+            # use default dtype
+            dtype = self.dtype
+
+        if dtype is not None:
+            # cast array or TN to dtype
+            if istn:
+                obj.astype_(dtype)
+            else:
+                if get_dtype_name(obj) != dtype:
+                    obj = astype(obj, dtype)
+
+        if self.to_backend is not None:
+            # once dtype is enforced, apply to_backend
+            # for e.g. gpu transfer etc
+            if istn:
+                obj.apply_to_arrays(self.to_backend)
+            else:
+                obj = self.to_backend(obj)
+
+        return obj
+
     def apply_to_arrays(self, fn):
         """Apply a function to all the arrays in the circuit."""
         self._psi.apply_to_arrays(fn)
@@ -1788,6 +1833,7 @@ def from_gates(cls, gates, N=None, progbar=False, **kwargs):
             N = 0
             for gate in gates:
+                gate = gate if isinstance(gate, Gate) else parse_to_gate(*gate)
                 if gate.qubits:
                     N = max(N, max(gate.qubits) + 1)
                 if gate.controls:
@@ -1861,10 +1907,11 @@ def _apply_gate(self, gate, tags=None, **gate_opts):
         else:
             # gate supplied as a matrix/tensor
             G = gate.array
-        if self.to_backend is not None:
+
+        if self.convert_eager:
             key = id(G)
             if key not in self._backend_gate_cache:
-                self._backend_gate_cache[key] = self.to_backend(G)
+                self._backend_gate_cache[key] = self._maybe_convert(G)
             G = self._backend_gate_cache[key]
 
         # apply the gate to the TN!
@@ -2377,7 +2424,9 @@ def psi(self):
         # make sure all same dtype and drop singlet dimensions
         psi = self._psi.copy()
         psi.squeeze_()
-        psi.astype_(psi.dtype)
+        if not self.convert_eager:
+            # not converted yet
+            self._maybe_convert(psi)
         return psi
 
     def get_uni(self, transposed=False):
@@ -2562,7 +2611,10 @@ def get_psi_simplified(
         if key in self._storage:
             return self._storage[key].copy()
 
-        psi = self.psi
+        # we simplify and store a copy
+        psi = self._psi.copy()
+        psi.squeeze_()
+
         # make sure to keep all outer indices
         output_inds = tuple(map(psi.site_ind, range(self.N)))
@@ -2645,7 +2697,7 @@ def amplitude(
         simplify_atol=1e-12,
         simplify_equalize_norms=True,
         backend=None,
-        dtype="complex128",
+        dtype=None,
         rehearse=False,
     ):
         r"""Get the amplitude coefficient of bitstring ``b``.
@@ -2706,7 +2758,7 @@ def amplitude(
 
         # perform a final simplification and cast
         psi_b.full_simplify_(**fs_opts)
-        psi_b.astype_(dtype)
+        self._maybe_convert(psi_b, dtype)
 
         if rehearse == "tn":
             return psi_b
@@ -2730,7 +2782,7 @@ def amplitude_rehearse(
         simplify_atol=1e-12,
         simplify_equalize_norms=True,
         optimize="auto-hq",
-        dtype="complex128",
+        dtype=None,
         rehearse=True,
     ):
         """Perform just the tensor network simplifications and contraction tree
@@ -2789,7 +2841,7 @@ def partial_trace(
         simplify_atol=1e-12,
         simplify_equalize_norms=True,
         backend=None,
-        dtype="complex128",
+        dtype=None,
         rehearse=False,
     ):
         r"""Perform the partial trace on the circuit wavefunction, retaining
@@ -2852,7 +2904,8 @@
             seq=simplify_sequence,
             atol=simplify_atol,
             equalize_norms=simplify_equalize_norms,
-        ).astype_(dtype)
+        )
+        self._maybe_convert(rho, dtype)
 
         if rehearse == "tn":
             return rho
@@ -2886,7 +2939,7 @@ def local_expectation(
         simplify_atol=1e-12,
         simplify_equalize_norms=True,
         backend=None,
-        dtype="complex128",
+        dtype=None,
         rehearse=False,
     ):
         r"""Compute a single expectation value of operator ``G``, acting on
@@ -2966,7 +3019,7 @@
         rhoG = rho | TG
 
         rhoG.full_simplify_(output_inds=output_inds, **fs_opts)
-        rhoG.astype_(dtype)
+        self._maybe_convert(rhoG, dtype)
 
         if rehearse == "tn":
             return rhoG
@@ -3091,7 +3144,7 @@ def compute_marginal(
             nm_lc.multiply_(nfact, spread_over="all")
 
         # cast to desired data type
-        nm_lc.astype_(dtype)
+        self._maybe_convert(nm_lc, dtype)
 
         if rehearse == "tn":
             return nm_lc
@@ -4122,9 +4175,7 @@ def to_dense(
             atol=simplify_atol,
             equalize_norms=simplify_equalize_norms,
         )
-
-        if dtype is not None:
-            psi.astype_(dtype)
+        self._maybe_convert(psi, dtype)
 
         if rehearse == "tn":
             return psi
@@ -4504,6 +4555,16 @@ class CircuitMPS(Circuit):
         - ``'nonlocal'``: turn the gate into a potentially nonlocal (sub) MPO
           and apply it directly. See :func:`tensor_network_1d_compress`.
 
+    dtype : str, optional
+        The data type to use for the state tensors.
+    to_backend : callable, optional
+        A function to convert tensor data to a particular backend.
+    convert_eager : bool, optional
+        Whether to eagerly perform dtype casting and application of
+        `to_backend` as gates are supplied, or wait until after the necessary
+        TNs for a particular task such as sampling are formed and simplified.
+        Eager conversion (`convert_eager=True`) is the default mode for
+        MPS simulation, unlike full contraction.
     circuit_opts
         Supplied to :class:`~quimb.tensor.circuit.Circuit`.
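As a counterpart to the deferred example above, a small sketch of the eager mode that `CircuitMPS` defaults to (sizes and gates are illustrative only):

```python
import quimb.tensor as qtn

# eager conversion (the MPS default): the evolving state is cast to
# complex64 immediately, and every gate is converted as it is applied
circ = qtn.CircuitMPS(4, dtype="complex64", convert_eager=True)
for i in range(4):
    circ.h(i)
for i in range(3):
    circ.cx(i, i + 1)

print(circ.psi.dtype_name)  # 'complex64'

# sampling then runs directly on the already-converted canonical MPS
for b in circ.sample(3, seed=42):
    print(b)
```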
@@ -4540,6 +4601,9 @@ def __init__( cutoff=1e-10, gate_opts=None, gate_contract="auto-mps", + dtype=None, + to_backend=None, + convert_eager=True, **circuit_opts, ): gate_opts = ensure_dict(gate_opts) @@ -4554,6 +4618,10 @@ def __init__( circuit_opts.setdefault("tag_gate_rounds", False) circuit_opts.setdefault("tag_gate_labels", False) + circuit_opts.setdefault("dtype", dtype) + circuit_opts.setdefault("to_backend", to_backend) + circuit_opts.setdefault("convert_eager", convert_eager) + super().__init__(N, psi0, gate_opts, **circuit_opts) def _init_state(self, N, dtype="complex128"): @@ -4571,10 +4639,10 @@ def apply_gates(self, gates, progbar=False, **gate_opts): ) for gate in gates: - if isinstance(gate, Gate): - self._apply_gate(gate, **gate_opts) - else: - self.apply_gate(*gate, **gate_opts) + if not isinstance(gate, Gate): + gate = parse_to_gate(*gate) + + self._apply_gate(gate, **gate_opts) if progbar and (gate.total_qubit_count >= 2): # these don't change for single qubit gates @@ -4586,7 +4654,10 @@ def apply_gates(self, gates, progbar=False, **gate_opts): @property def psi(self): # no squeeze so that bond dims of 1 preserved - return self._psi.copy() + psi = self._psi.copy() + if not self.convert_eager: + self._maybe_convert(psi) + return psi @property def uni(self): @@ -4611,6 +4682,17 @@ def sample( self, C, seed=None, + dtype=None, + *, + qubits=None, + order=None, + group_size=None, + max_marginal_storage=None, + optimize=None, + backend=None, + simplify_sequence=None, + simplify_atol=None, + simplify_equalize_norms=None, ): """Sample the MPS circuit ``C`` times. @@ -4621,7 +4703,31 @@ def sample( seed : None, int, or generator, optional A random seed or generator to use for reproducibility. """ - for config, _ in self._psi.sample(C, seed=seed): + unsupported = ( + qubits, + order, + group_size, + max_marginal_storage, + optimize, + backend, + simplify_sequence, + simplify_atol, + simplify_equalize_norms, + ) + + if any(x is not None for x in unsupported): + warnings.warn( + "Unsupported options for sampling an MPS circuit supplied, " + "ignoring: " + ", ".join(map(str, unsupported)) + ) + + if dtype is not None or not self.convert_eager: + psi = self._psi.copy() + self._maybe_convert(psi, dtype) + else: + psi = self._psi + + for config, _ in psi.sample(C, seed=seed): yield "".join(map(str, config)) def fidelity_estimate(self): @@ -4666,6 +4772,13 @@ def local_expectation( G, where, normalized=False, + dtype=None, + *, + simplify_sequence=None, + simplify_atol=None, + simplify_equalize_norms=None, + backend=None, + rehearse=None, **contract_opts, ): """Compute the local expectation value of a local operator at ``where`` @@ -4678,12 +4791,38 @@ def local_expectation( The local operator tensor. where : int The qubit to compute the expectation value at. + normalized : bool, optional + Whether to normalize the expectation value by the norm of the + state. + dtype : dtype, optional + If given, ensure the TN is cast to this dtype before contracting. 
         Returns
         -------
         float
         """
-        return self._psi.local_expectation_canonical(
+        unsupported = (
+            simplify_sequence,
+            simplify_atol,
+            simplify_equalize_norms,
+            backend,
+            rehearse,
+        )
+
+        if any(x is not None for x in unsupported):
+            warnings.warn(
+                "Unsupported options for computing local_expectation with an "
+                "MPS circuit supplied, ignoring: "
+                + ", ".join(map(str, unsupported))
+            )
+
+        if dtype is not None or not self.convert_eager:
+            psi = self._psi.copy()
+            self._maybe_convert(psi, dtype)
+        else:
+            psi = self._psi
+
+        return psi.local_expectation_canonical(
             G,
             where,
             normalized=normalized,
@@ -4747,7 +4886,12 @@ def get_psi_unordered(self):
         """
         return self._psi.copy()
 
-    def sample(self, C, seed=None):
+    def sample(
+        self,
+        C,
+        seed=None,
+        dtype=None,
+    ):
         """Sample the PermMPS circuit ``C`` times.
 
         Parameters
         ----------
@@ -4762,15 +4906,22 @@
         str
             The next sample bitstring.
         """
+        if dtype is not None or not self.convert_eager:
+            psi = self._psi.copy()
+            self._maybe_convert(psi, dtype)
+        else:
+            psi = self._psi
+
         # config is in physical order, so need to reorder for sampling
         ordering = self.calc_qubit_ordering()
-        for config, _ in self._psi.sample(C, seed=seed):
+        for config, _ in psi.sample(C, seed=seed):
             yield "".join(str(config[i]) for i in ordering)
 
     @property
     def psi(self):
         # need to reindex and retag the MPS
         psi = self._psi.copy()
+        psi.view_as_(TensorNetworkGenVector)
         psi.reindex_(
             {
@@ -4784,6 +4935,10 @@
                 for i, q in enumerate(self.qubits)
             }
         )
+
+        if not self.convert_eager:
+            self._maybe_convert(psi)
+
         return psi
 
 
@@ -4791,11 +4946,19 @@ class CircuitDense(Circuit):
     """Quantum circuit simulation keeping the state in full dense form."""
 
     def __init__(
-        self, N=None, psi0=None, gate_opts=None, gate_contract=True, tags=None
+        self,
+        N=None,
+        psi0=None,
+        gate_opts=None,
+        gate_contract=True,
+        tags=None,
+        convert_eager=True,
+        **circuit_opts,
     ):
         gate_opts = ensure_dict(gate_opts)
         gate_opts.setdefault("contract", gate_contract)
-        super().__init__(N, psi0, gate_opts, tags)
+        circuit_opts.setdefault("convert_eager", convert_eager)
+        super().__init__(N, psi0, gate_opts, tags=tags, **circuit_opts)
 
     @property
     def psi(self):
diff --git a/quimb/tensor/circuit_gen.py b/quimb/tensor/circuit_gen.py
index e4d93fff..450c3fce 100644
--- a/quimb/tensor/circuit_gen.py
+++ b/quimb/tensor/circuit_gen.py
@@ -135,6 +135,21 @@ def gates_to_param_circuit(gates, n, parametrize="U3", **circuit_opts):
     return circ
 
 
+def gates_1D_zigzag(n, depth, gate2="cz", seed=None):
+    ent_gates = []
+    forward_layer = [(i, i + 1) for i in range(n - 1)]
+    backward_layer = [(i + 1, i) for i in range(n - 2, -1, -1)]
+
+    for d in range(depth):
+        if d % 2 == 0:
+            ent_gates.extend(forward_layer)
+        else:
+            ent_gates.extend(backward_layer)
+
+    # inject U3 gates!
+ return inject_u3s(ent_gates, gate2=gate2, seed=seed) + + def circ_ansatz_1D_zigzag(n, depth, gate2="cz", seed=None, **circuit_opts): r"""A 1D circuit ansatz with forward and backward layers of entangling gates interleaved with U3 single qubit unitaries:: @@ -176,21 +191,27 @@ def circ_ansatz_1D_zigzag(n, depth, gate2="cz", seed=None, **circuit_opts): -------- circ_ansatz_1D_rand, circ_ansatz_1D_brickwork """ - ent_gates = [] - forward_layer = [(i, i + 1) for i in range(n - 1)] - backward_layer = [(i + 1, i) for i in range(n - 2, -1, -1)] + gates = gates_1D_zigzag(n, depth, gate2=gate2, seed=seed) + circ = gates_to_param_circuit(gates, n, **circuit_opts) + + return circ + +def gates_1D_brickwork(n, depth, cyclic=False, gate2="cz", seed=None): + ent_gates = [] for d in range(depth): - if d % 2 == 0: - ent_gates.extend(forward_layer) - else: - ent_gates.extend(backward_layer) + # the even pairs layer + ent_gates.extend((i, i + 1) for i in range(0, n - 1, 2)) + if cyclic and (n % 2 == 1): + ent_gates.append((n - 1, 0)) - # inject U3 gates! - gates = inject_u3s(ent_gates, gate2=gate2, seed=seed) - circ = gates_to_param_circuit(gates, n, **circuit_opts) + # the odd pairs layer + ent_gates.extend((i, i + 1) for i in range(1, n - 1, 2)) + if cyclic and (n % 2 == 0): + ent_gates.append((n - 1, 0)) - return circ + # inject U3 gates! + return inject_u3s(ent_gates, gate2=gate2, seed=seed) def circ_ansatz_1D_brickwork( @@ -239,25 +260,32 @@ def circ_ansatz_1D_brickwork( -------- circ_ansatz_1D_zigzag, circ_ansatz_1D_rand """ - ent_gates = [] - for d in range(depth): - # the even pairs layer - ent_gates.extend((i, i + 1) for i in range(0, n - 1, 2)) - if cyclic and (n % 2 == 1): - ent_gates.append((n - 1, 0)) - - # the odd pairs layer - ent_gates.extend((i, i + 1) for i in range(1, n - 1, 2)) - if cyclic and (n % 2 == 0): - ent_gates.append((n - 1, 0)) - - # inject U3 gates! - gates = inject_u3s(ent_gates, gate2=gate2, seed=seed) + gates = gates_1D_brickwork(n, depth, cyclic=cyclic, gate2=gate2, seed=seed) circ = gates_to_param_circuit(gates, n, **circuit_opts) return circ +def gates_1D_rand( + n, depth, seed=None, cyclic=False, gate2="cz", avoid_doubling=True +): + if seed is not None: + random.seed(seed) + + # the set number of entangling pairs to distribute randomly + ent_gates = [(i, i + 1) for i in range(n - 1) for _ in range(depth)] + if cyclic: + ent_gates.extend((n - 1, 0) for _ in range(depth)) + + # randomly permute the order + random.shuffle(ent_gates) + + # inject U3 gates! + return inject_u3s( + ent_gates, avoid_doubling=avoid_doubling, gate2=gate2, seed=seed + ) + + def circ_ansatz_1D_rand( n, depth, @@ -296,32 +324,20 @@ def circ_ansatz_1D_rand( -------- circ_ansatz_1D_zigzag, circ_ansatz_1D_brickwork """ - if seed is not None: - random.seed(seed) - - # the set number of entangling pairs to distribute randomly - ent_gates = [(i, i + 1) for i in range(n - 1) for _ in range(depth)] - if cyclic: - ent_gates.extend((n - 1, 0) for _ in range(depth)) - - # randomly permute the order - random.shuffle(ent_gates) - - # inject U3 gates! 
-    gates = inject_u3s(
-        ent_gates, avoid_doubling=avoid_doubling, gate2=gate2, seed=seed
+    gates = gates_1D_rand(
+        n,
+        depth,
+        seed=seed,
+        cyclic=cyclic,
+        gate2=gate2,
+        avoid_doubling=avoid_doubling,
     )
     circ = gates_to_param_circuit(gates, n, **circuit_opts)
 
     return circ
 
 
-def circ_a2a_rand(
-    n,
-    depth,
-    seed=None,
-    gate2="cz",
-):
+def gates_a2a_rand(n, depth, seed=None, gate2="cz"):
     if not isinstance(seed, random.Random):
         rng = random.Random(seed)
 
@@ -333,12 +349,64 @@
         for i, j in zip(qubits[::2], qubits[1::2]):
             ent_gates.append((i, j))
 
-    gates = inject_u3s(ent_gates, gate2=gate2, seed=seed)
-    circ = gates_to_param_circuit(gates, n)
+    return inject_u3s(ent_gates, gate2=gate2, seed=seed)
+
+
+def circ_a2a_rand(
+    n,
+    depth,
+    seed=None,
+    gate2="cz",
+    **circuit_opts,
+):
+    """Generate a random all-to-all quantum circuit.
+
+    Parameters
+    ----------
+    n : int
+        The number of qubits.
+    depth : int
+        The circuit depth. Each layer consists of `n // 2` entangling gates
+        applied between a random permutation of qubit pairs.
+    seed : int, optional
+        Random seed.
+    gate2 : {'cx', 'cy', 'cz', 'iswap', ..., str}, optional
+        The gate to use for the entangling pairs.
+
+    Returns
+    -------
+    Circuit
+    """
+    gates = gates_a2a_rand(n, depth, seed=seed, gate2=gate2)
+    circ = gates_to_param_circuit(gates, n, **circuit_opts)
 
     return circ
 
 
+def gates_qaoa(
+    terms,
+    depth,
+    gammas,
+    betas,
+):
+    n = max(itertools.chain.from_iterable(terms)) + 1
+
+    gates = []
+
+    # layer of hadamards to get into plus state
+    for i in range(n):
+        gates.append((0, "h", i))
+
+    for d in range(depth):
+        for (i, j), wij in terms.items():
+            gates.append((d, "rzz", wij * gammas[d], i, j))
+
+        for i in range(n):
+            gates.append((d, "rx", -betas[d] * 2, i))
+
+    return gates
+
+
 def circ_qaoa(
     terms,
     depth,
@@ -392,18 +459,7 @@
 
     n = max(itertools.chain.from_iterable(terms)) + 1
 
-    gates = []
-
-    # layer of hadamards to get into plus state
-    for i in range(n):
-        gates.append((0, "h", i))
-
-    for d in range(depth):
-        for (i, j), wij in terms.items():
-            gates.append((d, "rzz", wij * gammas[d], i, j))
-
-        for i in range(n):
-            gates.append((d, "rx", -betas[d] * 2, i))
+    gates = gates_qaoa(terms, depth, gammas, betas)
 
     circ = Circuit(n, **circuit_opts)
     circ.apply_gates(gates)
diff --git a/quimb/tensor/tensor_core.py b/quimb/tensor/tensor_core.py
index 3d45c756..e8cdb2e4 100644
--- a/quimb/tensor/tensor_core.py
+++ b/quimb/tensor/tensor_core.py
@@ -1944,6 +1944,11 @@ def dtype(self):
         """The data type of the array elements."""
         return getattr(self._data, "dtype", None)
 
+    @property
+    def dtype_name(self):
+        """The name of the data type of the array elements."""
+        return get_dtype_name(self._data)
+
     @property
     def backend(self):
         """The backend inferred from the data."""
@@ -9006,7 +9011,8 @@ def insert_compressor_between_regions(
 
         # then form the 'oblique' projectors
         Pl, Pr = decomp.compute_oblique_projectors(
-            Rl, Rr,
+            Rl,
+            Rr,
             max_bond=max_bond,
             cutoff=cutoff,
             **compress_opts,
@@ -9384,7 +9390,7 @@ def randomize(self, dtype=None, seed=None, inplace=False, **randn_opts):
 
     randomize_ = functools.partialmethod(randomize, inplace=True)
 
-    def strip_exponent(self, tid_or_tensor, value=None):
+    def strip_exponent(self, tid_or_tensor, value=None, check_zero=False):
         """Scale the elements of tensor corresponding to ``tid`` so that the
         norm of the array is some value, which defaults to ``1``.
         The log of the scaling factor, base 10, is then accumulated in the
         ``exponent``
@@ -9396,6 +9402,11 @@ def strip_exponent(self, tid_or_tensor, value=None):
             The tensor identifier or actual tensor.
         value : None or float, optional
             The value to scale the norm of the tensor to.
+        check_zero : bool, optional
+            Whether to check if the tensor has zero norm and in that case do
+            nothing, since the `exponent` would be -inf. Off by default to
+            avoid data dependent computational graphs when tracing and
+            computing gradients etc.
         """
         if (value is None) or (value is True):
             value = 1.0
@@ -9406,6 +9417,10 @@
             t = self.tensor_map[tid_or_tensor]
 
         stripped_factor = t.norm() / value
+
+        if check_zero and (stripped_factor == 0.0):
+            return
+
         t.modify(apply=lambda data: data / stripped_factor)
         self.exponent = self.exponent + do("log10", stripped_factor)
 
@@ -9420,7 +9435,7 @@ def distribute_exponent(self):
         # reset the exponent to zero
         self.exponent = 0.0
 
-    def equalize_norms(self, value=None, inplace=False):
+    def equalize_norms(self, value=None, check_zero=False, inplace=False):
         """Make the Frobenius norm of every tensor in this TN equal without
         changing the overall value if ``value=None``, or set the norm of
         every tensor to ``value`` by scalar multiplication only.
@@ -9431,6 +9446,11 @@
             Set the norm of each tensor to this value specifically. If
             supplied the change in overall scaling will be accumulated in
             ``tn.exponent`` in the form of a base 10 power.
+        check_zero : bool, optional
+            Whether, if and when equalizing norms, to check if tensors have
+            zero norm and in that case do nothing, since the `exponent` would
+            be -inf. Off by default to avoid data dependent computational
+            graphs when tracing and computing gradients etc.
         inplace : bool, optional
             Whether to perform the norm equalization inplace or not.
 
@@ -9441,7 +9461,7 @@
         tn = self if inplace else self.copy()
 
         for tid in tn.tensor_map:
-            tn.strip_exponent(tid, value=value)
+            tn.strip_exponent(tid, value=value, check_zero=check_zero)
 
         if value is None:
             tn.distribute_exponent()
@@ -9586,6 +9606,7 @@ def rank_simplify(
         equalize_norms=False,
         cache=None,
         max_combinations=500,
+        check_zero=False,
         inplace=False,
     ):
         """Simplify this tensor network by performing contractions that don't
@@ -9602,6 +9623,11 @@
             exponent in ``tn.exponent``.
         cache : None or set
             Persistent cache used to mark already checked tensors.
+        check_zero : bool, optional
+            Whether, if and when equalizing norms, to check if tensors have
+            zero norm and in that case do nothing, since the `exponent` would
+            be -inf. Off by default to avoid data dependent computational
+            graphs when tracing and computing gradients etc.
         inplace : bool, optional
             Whether to perform the rank simplification inplace.
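To illustrate the new guard, a small sketch using a hypothetical two-tensor network in which one tensor has exactly zero norm (API as added in this patch):

```python
import numpy as np
import quimb.tensor as qtn

tn = qtn.TensorNetwork([
    qtn.Tensor(np.ones((2, 2)), inds=("a", "b")),
    qtn.Tensor(np.zeros((2, 2)), inds=("b", "c")),  # zero norm
])

# with check_zero=True the zero tensor is skipped entirely, so the
# accumulated exponent stays finite instead of becoming -inf
tn_eq = tn.equalize_norms(value=1.0, check_zero=True)
print(tn_eq.exponent)  # log10(2) ~ 0.301, from the ones tensor only
```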
@@ -9747,7 +9773,7 @@ def rank_weight(ind):
 
                 tn |= tab
 
                 if equalize_norms:
-                    tn.strip_exponent(tab, equalize_norms)
+                    tn.strip_exponent(tab, equalize_norms, check_zero=check_zero)
 
                 for ix in out_ab:
                     # now we need to check output indices again
@@ -9755,10 +9781,16 @@
 
         if scalars:
             if equalize_norms:
+                # move overall scaling factor into exponent, absorb phase
                 signs = []
                 for s in scalars:
-                    signs.append(s / do("abs", s))
-                    tn.exponent += do("log10", do("abs", s))
+                    sa = do("abs", s)
+                    if check_zero and (sa == 0.0):
+                        # whole contraction is zero
+                        signs = [0.0]
+                        break
+                    signs.append(s / sa)
+                    tn.exponent += do("log10", sa)
                 scalars = signs
 
         if tn.num_tensors:
@@ -10018,6 +10050,7 @@ def split_simplify(
         atol=1e-12,
         equalize_norms=False,
         cache=None,
+        check_zero=False,
         inplace=False,
         **split_opts,
     ):
@@ -10034,6 +10067,11 @@
             exponent in ``tn.exponent``.
         cache : None or set
             Persistent cache used to mark already checked tensors.
+        check_zero : bool, optional
+            Whether, if and when equalizing norms, to check if tensors have
+            zero norm and in that case do nothing, since the `exponent` would
+            be -inf. Off by default to avoid data dependent computational
+            graphs when tracing and computing gradients etc.
         inplace : bool, optional
             Whether to perform the split simplification inplace.
         """
@@ -10070,8 +10108,12 @@
                     tn |= tr
 
                     if equalize_norms:
-                        tn.strip_exponent(tl, equalize_norms)
-                        tn.strip_exponent(tr, equalize_norms)
+                        tn.strip_exponent(
+                            tl, equalize_norms, check_zero=check_zero
+                        )
+                        tn.strip_exponent(
+                            tr, equalize_norms, check_zero=check_zero
+                        )
 
                 else:
                     cache.add(cache_key)
@@ -10088,6 +10130,7 @@ def pair_simplify(
         cache=None,
         equalize_norms=False,
         max_combinations=500,
+        check_zero=False,
         inplace=False,
         **split_opts,
     ):
@@ -10175,8 +10218,8 @@ def gen_pairs():
             tensor_fuse_squeeze(tl, tr)
             if equalize_norms:
-                tn.strip_exponent(tl, equalize_norms)
-                tn.strip_exponent(tr, equalize_norms)
+                tn.strip_exponent(tl, equalize_norms, check_zero=check_zero)
+                tn.strip_exponent(tr, equalize_norms, check_zero=check_zero)
 
             queue.extend(tl.inds)
             queue.extend(tr.inds)
@@ -10194,6 +10237,7 @@ def loop_simplify(
         loops=None,
         cache=None,
         equalize_norms=False,
+        check_zero=False,
         inplace=False,
         **split_opts,
     ):
@@ -10213,6 +10257,11 @@
         cache : set, optional
             For performance reasons can supply a cache for already checked
             loops.
+        check_zero : bool, optional
+            Whether, if and when equalizing norms, to check if tensors have
+            zero norm and in that case do nothing, since the `exponent` would
+            be -inf. Off by default to avoid data dependent computational
+            graphs when tracing and computing gradients etc.
         inplace : bool, optional
             Whether to replace the loops inplace.
         split_opts
@@ -10293,8 +10342,8 @@
             tensor_fuse_squeeze(tl, tr)
             if equalize_norms:
-                tn.strip_exponent(tl, equalize_norms)
-                tn.strip_exponent(tr, equalize_norms)
+                tn.strip_exponent(tl, equalize_norms, check_zero=check_zero)
+                tn.strip_exponent(tr, equalize_norms, check_zero=check_zero)
 
         return tn
 
@@ -10307,13 +10356,14 @@ def full_simplify(
         atol=1e-12,
         equalize_norms=False,
         cache=None,
-        inplace=False,
-        progbar=False,
         rank_simplify_opts=None,
         loop_simplify_opts=None,
         split_simplify_opts=None,
         custom_methods=(),
         split_method="svd",
+        check_zero="auto",
+        inplace=False,
+        progbar=False,
     ):
         """Perform a series of tensor network 'simplifications' in a loop
         until there is no more reduction in the number of tensors or indices.
         Note
@@ -10352,6 +10402,12 @@
         cache : None or set
             A persistent cache for each simplification process to mark
             already processed tensors.
+        check_zero : bool or "auto", optional
+            Whether to check if tensors have zero norm and in that case do
+            nothing if and when equalizing norms, rather than generating a
+            NaN. If "auto", this is only turned on when methods other than
+            "R" (e.g. "A", "D", "C", "S", "L"), which explicitly check data
+            entries, are being used (the default).
         progbar : bool, optional
             Show a live progress bar of the simplification process.
         inplace : bool, optional
@@ -10378,6 +10434,10 @@
         if output_inds is None:
             output_inds = self.outer_inds()
 
+        if check_zero == "auto":
+            # any method but R checks data entries anyway
+            check_zero = bool(set(seq) - {"R"})
+
         tn.squeeze_(exclude=output_inds)
 
         if cache is None:
@@ -10417,6 +10477,7 @@
                     output_inds=ix_o,
                     cache=cache,
                     equalize_norms=equalize_norms,
+                    check_zero=check_zero,
                     **rank_simplify_opts,
                 )
             elif meth == "A":
@@ -10430,6 +10491,7 @@
                     atol=atol,
                     cache=cache,
                     equalize_norms=equalize_norms,
+                    check_zero=check_zero,
                     **split_simplify_opts,
                 )
             elif meth == "L":
@@ -10438,6 +10500,7 @@
                     cutoff=atol,
                     cache=cache,
                     equalize_norms=equalize_norms,
+                    check_zero=check_zero,
                     **loop_simplify_opts,
                 )
             elif meth == "P":
@@ -10446,6 +10509,7 @@
                     cutoff=atol,
                     cache=cache,
                     equalize_norms=equalize_norms,
+                    check_zero=check_zero,
                     **loop_simplify_opts,
                 )
             else:
@@ -10457,9 +10521,10 @@
         if equalize_norms:
             if equalize_norms is True:
                 # this also redistributes the collected exponents
-                tn.equalize_norms_()
+                value = None
             else:
-                tn.equalize_norms_(value=equalize_norms)
+                value = equalize_norms
+            tn.equalize_norms_(value=value, check_zero=check_zero)
 
         if progbar:
             pbar.close()
@@ -10589,6 +10654,7 @@ def compress_simplify(
         max_simplification_iterations=100,
         converged_tol=0.01,
         equalize_norms=True,
+        check_zero=True,
         progbar=False,
         inplace=False,
         **full_simplify_opts,
@@ -10601,6 +10667,7 @@
         simplify_opts = {
             "atol": atol,
             "equalize_norms": equalize_norms,
+            "check_zero": check_zero,
             "progbar": progbar,
             "output_inds": output_inds,
             "cache": set(),
@@ -10678,8 +10745,21 @@ def dtype(self):
         """The dtype of this TensorNetwork, which is the minimal common type
         of all the tensors' data.
         """
+        # TODO: support non numpy dtypes here
         return get_common_dtype(*self.arrays)
 
+    @property
+    def dtype_name(self):
+        """The name of the data type of the array elements."""
+        return next(iter(self.tensor_map.values())).dtype_name
+
+    @property
+    def backend(self):
+        """Get the backend of any tensor in this network, assuming it to be
+        the same for all tensors.
+ """ + return next(iter(self.tensor_map.values())).backend + def iscomplex(self): return iscomplex(self) diff --git a/tests/test_tensor/test_circuit.py b/tests/test_tensor/test_circuit.py index df7085fe..2508b963 100644 --- a/tests/test_tensor/test_circuit.py +++ b/tests/test_tensor/test_circuit.py @@ -1,8 +1,8 @@ -import math import itertools +import math -import pytest import numpy as np +import pytest from numpy.testing import assert_allclose import quimb as qu @@ -377,8 +377,8 @@ def test_auto_split_gate(self): @pytest.mark.parametrize("gate2", ["cx", "iswap"]) def test_circuit_simplify_tensor_network(self, gate2): - import random import itertools + import random depth = n = 8 @@ -449,6 +449,7 @@ def test_partial_trace(self): @pytest.mark.parametrize("group_size", (1, 2, 6)) def test_sample(self, group_size): import collections + from scipy.stats import power_divergence C = 2**10 @@ -469,6 +470,7 @@ def test_sample(self, group_size): @pytest.mark.parametrize("group_size", (1, 3)) def test_sample_gate_by_gate(self, group_size): import collections + from scipy.stats import power_divergence C = 2**10 @@ -490,6 +492,7 @@ def test_sample_gate_by_gate(self, group_size): def test_sample_chaotic(self): import collections + from scipy.stats import power_divergence C = 2**12 @@ -659,6 +662,63 @@ def test_multi_controlled_circuit(self): (b,) = circ.sample(1, group_size=3) assert b[N - 2] == "0" + @pytest.mark.parametrize("dtype", [None, "complex64", "complex128"]) + @pytest.mark.parametrize("backend", [None, "torch"]) + @pytest.mark.parametrize("dtype_final", [None, "complex64", "complex128"]) + @pytest.mark.parametrize("convert_eager", [True, False]) + def test_conversions(self, dtype, backend, dtype_final, convert_eager): + if backend == "torch": + pytest.importorskip("torch") + + def to_backend(x): + import torch + + return torch.tensor(x) + + else: + to_backend = None + + circ = qtn.Circuit( + 2, dtype=dtype, to_backend=to_backend, convert_eager=convert_eager + ) + circ.h(0) + circ.cx(0, 1) + circ.y(1) + + if not convert_eager: + # constructed with default dtype + assert circ._psi.dtype_name == "complex128" + assert circ._psi.backend == "numpy" + else: + # constructed with this type + assert circ._psi.dtype == dtype or dtype is None + if backend == "torch": + assert circ._psi.backend == "torch" + else: + assert circ._psi.backend == "numpy" + + # converted to this type + if dtype is None: + expected_default_dtype = "complex128" + else: + expected_default_dtype = dtype + + if backend != "torch": + test_tn_default = circ.amplitude_tn() + test_tn_explicit = circ.amplitude_tn(dtype=dtype_final) + else: + # test a less simplified tensor network + test_tn_default = circ.partial_trace_tn( + (1,), simplify_sequence="R" + ) + test_tn_explicit = circ.partial_trace_tn( + (1,), simplify_sequence="R", dtype=dtype_final + ) + + assert test_tn_default.dtype_name == expected_default_dtype + if dtype_final is not None: + assert test_tn_explicit.dtype_name == dtype_final + class TestCircuitMPS: def test_from_qsim_mps_swapsplit(self):