From c682d1e69b055040db0fd9cd0128687929a1aa68 Mon Sep 17 00:00:00 2001
From: vncntt <provincent8412@gmail.com>
Date: Mon, 24 Feb 2025 01:49:11 -0800
Subject: [PATCH 1/9] knights_knaves

---
 reasoning_gym/arithmetic/decimal_chain_sum.py |   1 -
 reasoning_gym/logic/knights_knaves.py         | 458 ++++++++++++++++++
 2 files changed, 458 insertions(+), 1 deletion(-)
 create mode 100644 reasoning_gym/logic/knights_knaves.py

diff --git a/reasoning_gym/arithmetic/decimal_chain_sum.py b/reasoning_gym/arithmetic/decimal_chain_sum.py
index de22bb29..d2313789 100644
--- a/reasoning_gym/arithmetic/decimal_chain_sum.py
+++ b/reasoning_gym/arithmetic/decimal_chain_sum.py
@@ -3,7 +3,6 @@
 from decimal import Decimal
 from typing import Any, Optional
 
-from ..coaching import AttributeType, BaseCurriculum, RangeAttributeDefinition
 from ..factory import ProceduralDataset, register_dataset
 
 
diff --git a/reasoning_gym/logic/knights_knaves.py b/reasoning_gym/logic/knights_knaves.py
new file mode 100644
index 00000000..0d0a7610
--- /dev/null
+++ b/reasoning_gym/logic/knights_knaves.py
@@ -0,0 +1,458 @@
+import copy
+import itertools
+from dataclasses import dataclass
+from random import Random
+from typing import Any, Optional
+
+import numpy as np
+
+from reasoning_gym.factory import ProceduralDataset, register_dataset
+
+COMMON_NAMES = [
+    "Emma",
+    "Liam",
+    "Olivia",
+    "Noah",
+    "Ava",
+    "Ethan",
+    "Sophia",
+    "Mason",
+    "Isabella",
+    "William",
+    "Mia",
+    "James",
+    "Charlotte",
+    "Benjamin",
+    "Amelia",
+    "Lucas",
+    "Harper",
+    "Henry",
+    "Evelyn",
+    "Alexander",
+    "Abigail",
+    "Michael",
+    "Emily",
+    "Daniel",
+    "Elizabeth",
+    "Jacob",
+    "Sofia",
+    "Logan",
+    "Avery",
+    "Jackson",
+    "Ella",
+    "Sebastian",
+    "Scarlett",
+    "Jack",
+    "Grace",
+    "Aiden",
+    "Chloe",
+    "Owen",
+    "Victoria",
+    "Samuel",
+    "Riley",
+    "Matthew",
+    "Aria",
+    "Joseph",
+    "Lily",
+    "Luke",
+    "Aurora",
+    "David",
+    "Zoey",
+    "Oliver",
+    "Penelope",
+]
+
+KNIGHT_KNAVE_PAIRS = [
+    ["a pioneer", "a laggard"],
+    ["a saint", "a sinner"],
+    ["a hero", "a villain"],
+    ["an angel", "a devil"],
+    ["an altruist", "an egoist"],
+    ["a sage", "a fool"],
+]
+
+PREFIX = (
+    "A very special island is inhabited only by {knight}s and {knave}s. "
+    + "{Knight}s always tell the truth, and {knave}s always lie. "
+)
+
+POSTFIX = "So who is {a_knight} and who is {a_knave}?"
+
+TEMPLATES = [
+    "{name} said that {content}.",
+    "{name} told you that {content}.",
+    '{name} said, "{content}."',
+    '{name} stated, "{content}".',
+    'According to {name}, "{content}".',
+    'In {name}\'s words: "{content}".',
+    '{name} remarked, "{content}".',
+    '"{content}," {name} declared.',
+    '{name} was heard saying, "{content}".',
+    "{name} expressed that {content}.",
+    '"{content}" - {name}.',
+    'As {name} put it, "{content}".',
+    '{name} asserted: "{content}".',
+    '"{content}," {name} mentioned.',
+    '{name} commented, "{content}".',
+    'In a statement by {name}: "{content}".',
+    '{name} noted, "{content}".',
+    '"{content}," {name} claimed.',
+]
+
+
+@dataclass
+class KnightsKnavesConfig:
+    """
+    Configuration for knights and knaves task generation.
+
+    :param n_people: Number of people in the problem
+    :param depth_constraint: Maximum depth of each person's statement
+    :param width_constraint: Maximum width (number of branches) of each person's statement
+    :param size: Number of items in the dataset ??????
+    :param seed: Random seed
+    """
+
+    n_people: int = 2
+    depth_constraint: int = 2
+    width_constraint: int = 2
+    size: int = 500
+    seed: Optional[int] = None
+
+    def validate(self):
+        assert 1 <= self.n_people, "Number of people must be at least 1"
+        assert 1 <= self.depth_constraint, "Depth constraint must be at least 1"
+        assert 1 <= self.width_constraint, "Width constraint must be at least 1"
+
+
+class KKProblemSampler:
+    def __init__(self, rand_seed: int, n_people: int, depth_constraint: int = 2, width_constraint: int = 2):
+        self.rng = np.random.default_rng(rand_seed)
+        self.n_people = n_people
+        self.depth_constraint = depth_constraint
+        self.width_constraint = width_constraint
+
+    def sample(self):
+        statements = tuple(
+            self._sample_statement(person_id, self.depth_constraint) for person_id in range(self.n_people)
+        )
+        return self._immutable_statements(statements)
+
+    def sample_valid_problems(
+        self,
+        n_problems: int,
+        max_retry: int = 1000,
+        skip_no_solution: bool = True,
+        skip_multiple_solutions: bool = True,
+    ):
+        problems = []
+        unique_statements = set()
+        for _ in range(n_problems):
+            for _ in range(max_retry):
+                statements = self.sample()
+                if statements in unique_statements:
+                    continue
+                solutions = KnightsKnavesDataset.find_solution(statements)
+                if len(solutions) == 0 and skip_no_solution:
+                    continue
+                if len(solutions) > 1 and skip_multiple_solutions:
+                    continue
+                sol = solutions[0] if len(solutions) > 0 else None
+                problems.append({"statements": statements, "solution": sol, "all_solutions": solutions})
+                unique_statements.add(statements)
+                break
+        return problems
+
+    def _sample_statement(self, person_id: int, depth_constraint: int):
+        dice = self.rng.integers(0, 6)
+        if depth_constraint == 1 or dice == 0:
+            while True:
+                knight_or_knave = self.rng.choice(["telling-truth", "lying"])
+                person = self.rng.integers(0, self.n_people)
+                if not (knight_or_knave == "lying" and person == person_id):
+                    return (knight_or_knave, person)
+        if dice == 1:
+            return ("not", self._sample_statement(person_id, depth_constraint - 1))
+        if dice in [2, 3]:
+            operator = ["and", "or"][dice - 2]
+            n_substatements = self.rng.integers(2, self.width_constraint + 1)
+            return (operator,) + self._sample_substatements(person_id, depth_constraint, n_substatements)
+        if dice in [4, 5]:
+            operator = ["->", "<=>"][dice - 4]
+            return (operator,) + self._sample_substatements(person_id, depth_constraint, 2)
+
+    def _sample_substatements(self, person_id: int, depth_constraint: int, count: int, dedup: bool = True):
+        sub_statements = []
+        dedup_set = set()
+        while True:
+            stmt = self._sample_statement(person_id, depth_constraint - 1)
+            if dedup:
+                if stmt in dedup_set:
+                    continue
+                dedup_set.add(stmt)
+            sub_statements.append(stmt)
+            if len(sub_statements) == count:
+                break
+        return tuple(sub_statements)
+
+    def _immutable_statements(self, mutable_statements):
+        def _make_immutable(x):
+            if isinstance(x, (list, tuple)):
+                return tuple(_make_immutable(child) for child in x)
+            if isinstance(x, np.str_):
+                return str(x)
+            if isinstance(x, np.int64):
+                return int(x)
+            return x
+
+        return tuple(_make_immutable(s) for s in mutable_statements)
+
+
+class KKProblemFormatter:
+    def __init__(self, rand_seed, problem):
+        self.rng = np.random.default_rng(rand_seed)
+        self.problem = problem
+
+    def format_problem(
+        self,
+        random_names=True,
+        random_saying_template=True,
+        random_knight_knave_pairs=True,
+        flip_knight_knave_pair=False,
+        uncommon_name=False,
+        reorder_statement=False,
+    ):
+        statements = copy.deepcopy(self.problem["statements"])
+        n_people = len(statements)
+        names = list(self.rng.choice(COMMON_NAMES, size=n_people, replace=False))
+        knight_knave = ["a knight", "a knave"]
+        if random_knight_knave_pairs:
+            knight_knave = self.rng.choice(KNIGHT_KNAVE_PAIRS)
+        knight_knave = {
+            "knight": knight_knave[0].split()[1],
+            "knave": knight_knave[1].split()[1],
+            "a_knight": knight_knave[0],
+            "a_knave": knight_knave[1],
+        }
+        knight_knave["Knight"] = knight_knave["knight"].capitalize()
+        knight_knave["Knave"] = knight_knave["knave"].capitalize()
+
+        text = PREFIX.format(**knight_knave)
+        text += f"You meet {n_people} inhabitants: "
+        text += ", ".join(names[:-1]) + ", and " + names[-1] + "."
+
+        text_statements = []
+        for i, stmt in enumerate(statements):
+            tmpl = self.rng.choice(TEMPLATES)
+            content = self._format_statement(names, knight_knave, stmt)
+            text_statements.append(" " + tmpl.format(name=names[i], content=content))
+
+        text += "".join(text_statements)
+        text += " " + POSTFIX.format(**knight_knave)
+        format = ", ".join(f"{name} is a {knight_knave['knight']}/{knight_knave['knave']}" for name in names[:-1])
+        if len(names) > 1:
+            format += f", and {names[-1]} is a {knight_knave['knight']}/{knight_knave['knave']}"
+        else:
+            format = f"{names[0]} is a {knight_knave['knight']}/{knight_knave['knave']}"
+
+        text += f' (Format your answer like: "{format}")'
+
+        if self.problem["solution"] is None:
+            solution_text = "No valid solution exists."
+        else:
+            solution_stmts = []
+            for name, indicator in zip(names, self.problem["solution"]):
+                if indicator:
+                    solution_stmts.append(name + " is " + knight_knave["a_knight"])
+                else:
+                    solution_stmts.append(name + " is " + knight_knave["a_knave"])
+            solution_text = ", ".join(solution_stmts[:-1]) + ", and " + solution_stmts[-1] + "."
+        return {
+            "quiz": text,
+            "names": names,
+            "knight_knave": knight_knave,
+            "solution": self.problem["solution"],
+            "solution_text": solution_text,
+        }
+
+    def _format_statement(self, names, knight_knave, statement):
+        if statement[0] == "not":
+            return self._format_knight_knave(names, knight_knave, statement[1], negation=True)
+        if statement[0] in ["and", "or"]:
+            return (" " + statement[0] + " ").join(
+                self._format_knight_knave(names, knight_knave, sub_stmt) for sub_stmt in statement[1:]
+            )
+        if statement[0] == "->":
+            return (
+                "If "
+                + self._format_knight_knave(names, knight_knave, statement[1])
+                + " then "
+                + self._format_knight_knave(names, knight_knave, statement[2])
+            )
+        if statement[0] == "<=>":
+            return (
+                self._format_knight_knave(names, knight_knave, statement[1])
+                + " if and only if "
+                + self._format_knight_knave(names, knight_knave, statement[2])
+            )
+        return self._format_knight_knave(names, knight_knave, statement)
+
+    def _format_knight_knave(self, names, knight_knave, statement, negation=False):
+        assert statement[0] in ("telling-truth", "lying")
+        text = names[statement[1]] + " is "
+        if negation:
+            text += "not "
+        text += {"telling-truth": knight_knave["a_knight"], "lying": knight_knave["a_knave"]}[statement[0]]
+        return text
+
+
+class KnightsKnavesDataset(ProceduralDataset):
+    """
+    Generates random knights and knaves problems.
+    """
+
+    def __init__(self, config: KnightsKnavesConfig):
+        super().__init__(config, seed=config.seed, size=config.size)
+
+    @staticmethod
+    def find_solution(statements):
+        """Find solutions given a list of statements."""
+        n_people = len(statements)
+        single_statement = ("and",) + tuple(
+            ("<=>", ("telling-truth", i), statements[i]) for i in range(len(statements))
+        )
+        # Brute force
+        solutions = []
+        for assignments in itertools.product([True, False], repeat=n_people):
+            # if KnightsKnavesDataset.test_satisfiability(single_statement, assignments):
+            if KnightsKnavesDataset.test_satisfiability(single_statement, assignments):
+                solutions.append(assignments)
+        return solutions
+
+    @staticmethod
+    def test_satisfiability(statement, assignments):
+        """Recursively test if a statement is satisfied under given assignments."""
+        if statement[0] == "telling-truth":
+            return assignments[statement[1]]
+        if statement[0] == "lying":
+            return not assignments[statement[1]]
+        if statement[0] == "not":
+            return not KnightsKnavesDataset.test_satisfiability(statement[1], assignments)
+        if statement[0] == "and":
+            return np.all(
+                [KnightsKnavesDataset.test_satisfiability(statement[i], assignments) for i in range(1, len(statement))]
+            )
+        if statement[0] == "or":
+            return np.any(
+                [KnightsKnavesDataset.test_satisfiability(statement[i], assignments) for i in range(1, len(statement))]
+            )
+        if statement[0] == "->":
+            val1 = KnightsKnavesDataset.test_satisfiability(statement[1], assignments)
+            val2 = KnightsKnavesDataset.test_satisfiability(statement[2], assignments)
+            return (not val1) or val2
+        if statement[0] == "<=>":
+            val1 = KnightsKnavesDataset.test_satisfiability(statement[1], assignments)
+            val2 = KnightsKnavesDataset.test_satisfiability(statement[2], assignments)
+            return (val1 and val2) or ((not val1) and (not val2))
+        raise KeyError(f"Unknown statement: {statement}")
+
+    def __getitem__(self, idx: int) -> dict[str, Any]:
+        """
+        Generate a single knights and knaves problem item.
+
+        Args:
+            idx: Index of the item to generate
+
+        Returns:
+            dict containing at least:
+                - question: str (the puzzle in natural language)
+                - answer: str (the solution in text)
+                - metadata: dict (additional problem details)
+        """
+        rng = Random(self.seed + idx if self.seed is not None else None)
+        return self.__generate_problem(rng)
+
+    def __generate_problem(self, rng: Random) -> dict[str, Any]:
+        """
+        Generate a single knights and knaves problem with a unique solution.
+        """
+
+        # Sample a valid problem using the original KKProblemSampler logic
+        # Use the sampler to generate a valid problem
+        sampler = KKProblemSampler(
+            rand_seed=rng.randint(0, 2**32),
+            n_people=self.config.n_people,
+            depth_constraint=self.config.depth_constraint,
+            width_constraint=self.config.width_constraint,
+        )
+        problems = sampler.sample_valid_problems(1, skip_no_solution=True, skip_multiple_solutions=True)
+        problem = problems[0]
+
+        # Format the problem using the original KKProblemFormatter logic
+
+        # Format the problem
+        formatter = KKProblemFormatter(rand_seed=rng.randint(0, 2**32), problem=problem)
+        formatted = formatter.format_problem()
+
+        # Prepare the return dictionary
+        question = formatted["quiz"]
+        answer = formatted["solution_text"]
+        metadata = {
+            "statements": problem["statements"],
+            "solution": problem["solution"],
+            "names": formatted["names"],
+            "knight_knave_terms": formatted["knight_knave"],
+        }
+
+        return {"question": question, "answer": answer, "metadata": metadata}
+
+    def _normalize_answer(self, answer: str) -> set[tuple[str, str]]:
+        """Convert answer string into normalized set of (name, role) tuples"""
+        # Remove common punctuation and standardize spacing
+        answer = answer.lower().strip().replace(".", "").replace(",", "")
+
+        # Split on 'and' or spaces for different formats
+        parts = [p.strip() for p in answer.replace(" and ", " ").split()]
+
+        # Extract name-role pairs
+        assignments = set()
+        current_name = None
+
+        for part in parts:
+            if part in ["is", "a"]:
+                continue
+            if part in ["knight", "knave"]:
+                if current_name:
+                    assignments.add((current_name, part))
+                    current_name = None
+            else:
+                current_name = part
+
+        return assignments
+
+    def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
+        """Score an answer against the oracle answer."""
+        if answer is None or len(answer) == 0:
+            return 0.0
+
+        try:
+            oracle_assignments = self._normalize_answer(entry["answer"])
+            answer_assignments = self._normalize_answer(answer)
+
+            # Full credit for exact assignments regardless of order
+            if oracle_assignments == answer_assignments:
+                return 1.0
+
+            # Partial credit if all names are present and some assignments match
+            if len(oracle_assignments) == len(answer_assignments):
+                matching = len(oracle_assignments.intersection(answer_assignments))
+                if matching > 0:
+                    return 0.3 + (0.7 * matching / len(oracle_assignments))
+
+            return 0.01
+
+        except Exception:
+            # If parsing fails, give minimal credit
+            return 0.01
+
+
+register_dataset("knights_knaves", KnightsKnavesDataset, KnightsKnavesConfig)

From a8224d43b8d78b267023bebd5e644accc7bd34bb Mon Sep 17 00:00:00 2001
From: vncntt <provincent8412@gmail.com>
Date: Mon, 24 Feb 2025 02:23:14 -0800
Subject: [PATCH 2/9] fix scoring bug

---
 reasoning_gym/logic/knights_knaves.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/reasoning_gym/logic/knights_knaves.py b/reasoning_gym/logic/knights_knaves.py
index 0d0a7610..bdb83c14 100644
--- a/reasoning_gym/logic/knights_knaves.py
+++ b/reasoning_gym/logic/knights_knaves.py
@@ -63,6 +63,7 @@
 ]
 
 KNIGHT_KNAVE_PAIRS = [
+    ["a knight", "a knave"],
     ["a pioneer", "a laggard"],
     ["a saint", "a sinner"],
     ["a hero", "a villain"],
@@ -71,6 +72,8 @@
     ["a sage", "a fool"],
 ]
 
+VALID_ROLES = {pair[i].split()[1] for pair in KNIGHT_KNAVE_PAIRS for i in range(2)}
+
 PREFIX = (
     "A very special island is inhabited only by {knight}s and {knave}s. "
     + "{Knight}s always tell the truth, and {knave}s always lie. "
@@ -420,7 +423,7 @@ def _normalize_answer(self, answer: str) -> set[tuple[str, str]]:
         for part in parts:
             if part in ["is", "a"]:
                 continue
-            if part in ["knight", "knave"]:
+            if part in VALID_ROLES:
                 if current_name:
                     assignments.add((current_name, part))
                     current_name = None

From e12242dc4ebea9a6ddbcde53ec694259283a55db Mon Sep 17 00:00:00 2001
From: vncntt <provincent8412@gmail.com>
Date: Mon, 24 Feb 2025 20:58:38 -0800
Subject: [PATCH 3/9] unit tests + r1 eval

---
 eval/r1/test/knights_knaves.json      |  98 ++++++++++++++
 eval/r1/test/summary.json             | 100 ++++++++++++++
 eval/r1/yaml/test.yaml                |   2 +-
 reasoning_gym/logic/__init__.py       |   3 +
 reasoning_gym/logic/knights_knaves.py |   8 +-
 tests/test_knights_knaves.py          | 187 ++++++++++++++++++++++++++
 6 files changed, 394 insertions(+), 4 deletions(-)
 create mode 100644 eval/r1/test/knights_knaves.json
 create mode 100644 eval/r1/test/summary.json
 create mode 100644 tests/test_knights_knaves.py

diff --git a/eval/r1/test/knights_knaves.json b/eval/r1/test/knights_knaves.json
new file mode 100644
index 00000000..79ea045d
--- /dev/null
+++ b/eval/r1/test/knights_knaves.json
@@ -0,0 +1,98 @@
+{
+  "dataset_name": "knights_knaves",
+  "model": "deepseek/deepseek-r1",
+  "size": 10,
+  "provider": "Nebius",
+  "average_score": 1.0,
+  "total_examples": 10,
+  "timestamp": "2025-02-24T20:50:44.558572",
+  "config": {
+    "n_people": 2,
+    "depth_constraint": 2,
+    "width_constraint": 2,
+    "size": 10,
+    "seed": 42
+  },
+  "results": [
+    {
+      "question": "A very special island is inhabited only by sages and fools. Sages always tell the truth, and fools always lie. You meet 2 inhabitants: Zoey, and Riley. Zoey commented, \"Riley is a fool\". In Riley's words: \"Zoey is a sage or Riley is a sage\". So who is a sage and who is a fool? (Format your answer like: \"Zoey is a sage/fool, and Riley is a sage/fool\")",
+      "expected_answer": "Zoey is a fool, and Riley is a sage.",
+      "model_answer": "Zoey is a fool, and Riley is a sage",
+      "full_model_response": "<answer>Zoey is a fool, and Riley is a sage</answer>",
+      "score": 1.0,
+      "metadata": "{'statements': (('lying', 1), ('or', ('telling-truth', 0), ('telling-truth', 1))), 'solution': (False, True), 'names': [np.str_('Zoey'), np.str_('Riley')], 'knight_knave_terms': {'knight': 'sage', 'knave': 'fool', 'a_knight': np.str_('a sage'), 'a_knave': np.str_('a fool'), 'Knight': 'Sage', 'Knave': 'Fool'}}"
+    },
+    {
+      "question": "A very special island is inhabited only by pioneers and laggards. Pioneers always tell the truth, and laggards always lie. You meet 2 inhabitants: Riley, and Olivia. \"If Riley is a pioneer then Olivia is a laggard\" - Riley. Olivia stated, \"Olivia is a pioneer and Riley is a laggard\". So who is a pioneer and who is a laggard? (Format your answer like: \"Riley is a pioneer/laggard, and Olivia is a pioneer/laggard\")",
+      "expected_answer": "Riley is a pioneer, and Olivia is a laggard.",
+      "model_answer": "Riley is a pioneer, and Olivia is a laggard",
+      "full_model_response": "<answer>Riley is a pioneer, and Olivia is a laggard</answer>",
+      "score": 1.0,
+      "metadata": "{'statements': (('->', ('telling-truth', 0), ('lying', 1)), ('and', ('telling-truth', 1), ('lying', 0))), 'solution': (True, False), 'names': [np.str_('Riley'), np.str_('Olivia')], 'knight_knave_terms': {'knight': 'pioneer', 'knave': 'laggard', 'a_knight': np.str_('a pioneer'), 'a_knave': np.str_('a laggard'), 'Knight': 'Pioneer', 'Knave': 'Laggard'}}"
+    },
+    {
+      "question": "A very special island is inhabited only by saints and sinners. Saints always tell the truth, and sinners always lie. You meet 2 inhabitants: Samuel, and Jacob. Samuel expressed that If Samuel is a saint then Jacob is a sinner. Jacob was heard saying, \"If Samuel is a saint then Samuel is a sinner\". So who is a saint and who is a sinner? (Format your answer like: \"Samuel is a saint/sinner, and Jacob is a saint/sinner\")",
+      "expected_answer": "Samuel is a saint, and Jacob is a sinner.",
+      "model_answer": "Samuel is a saint, and Jacob is a sinner",
+      "full_model_response": "<answer>Samuel is a saint, and Jacob is a sinner</answer>",
+      "score": 1.0,
+      "metadata": "{'statements': (('->', ('telling-truth', 0), ('lying', 1)), ('->', ('telling-truth', 0), ('lying', 0))), 'solution': (True, False), 'names': [np.str_('Samuel'), np.str_('Jacob')], 'knight_knave_terms': {'knight': 'saint', 'knave': 'sinner', 'a_knight': np.str_('a saint'), 'a_knave': np.str_('a sinner'), 'Knight': 'Saint', 'Knave': 'Sinner'}}"
+    },
+    {
+      "question": "A very special island is inhabited only by pioneers and laggards. Pioneers always tell the truth, and laggards always lie. You meet 2 inhabitants: Olivia, and Lily. \"Lily is a laggard and Lily is a pioneer\" - Olivia. Lily noted, \"If Olivia is a pioneer then Lily is a pioneer\". So who is a pioneer and who is a laggard? (Format your answer like: \"Olivia is a pioneer/laggard, and Lily is a pioneer/laggard\")",
+      "expected_answer": "Olivia is a laggard, and Lily is a pioneer.",
+      "model_answer": "Olivia is a laggard, and Lily is a pioneer",
+      "full_model_response": "<answer>Olivia is a laggard, and Lily is a pioneer</answer>",
+      "score": 1.0,
+      "metadata": "{'statements': (('and', ('lying', 1), ('telling-truth', 1)), ('->', ('telling-truth', 0), ('telling-truth', 1))), 'solution': (False, True), 'names': [np.str_('Olivia'), np.str_('Lily')], 'knight_knave_terms': {'knight': 'pioneer', 'knave': 'laggard', 'a_knight': np.str_('a pioneer'), 'a_knave': np.str_('a laggard'), 'Knight': 'Pioneer', 'Knave': 'Laggard'}}"
+    },
+    {
+      "question": "A very special island is inhabited only by altruists and egoists. Altruists always tell the truth, and egoists always lie. You meet 2 inhabitants: Mason, and Jack. Mason expressed that If Jack is an egoist then Mason is an altruist. In Jack's words: \"Mason is not an altruist\". So who is an altruist and who is an egoist? (Format your answer like: \"Mason is a altruist/egoist, and Jack is a altruist/egoist\")",
+      "expected_answer": "Mason is an altruist, and Jack is an egoist.",
+      "model_answer": "Mason is a altruist, and Jack is a egoist",
+      "full_model_response": "<answer>Mason is a altruist, and Jack is a egoist</answer>",
+      "score": 1.0,
+      "metadata": "{'statements': (('->', ('lying', 1), ('telling-truth', 0)), ('not', ('telling-truth', 0))), 'solution': (True, False), 'names': [np.str_('Mason'), np.str_('Jack')], 'knight_knave_terms': {'knight': 'altruist', 'knave': 'egoist', 'a_knight': np.str_('an altruist'), 'a_knave': np.str_('an egoist'), 'Knight': 'Altruist', 'Knave': 'Egoist'}}"
+    },
+    {
+      "question": "A very special island is inhabited only by heros and villains. Heros always tell the truth, and villains always lie. You meet 2 inhabitants: Benjamin, and Oliver. Benjamin noted, \"Oliver is a villain or Benjamin is a hero\". Oliver commented, \"If Oliver is a hero then Benjamin is a villain\". So who is a hero and who is a villain? (Format your answer like: \"Benjamin is a hero/villain, and Oliver is a hero/villain\")",
+      "expected_answer": "Benjamin is a villain, and Oliver is a hero.",
+      "model_answer": "Benjamin is a villain, and Oliver is a hero",
+      "full_model_response": "<answer>Benjamin is a villain, and Oliver is a hero</answer>",
+      "score": 1.0,
+      "metadata": "{'statements': (('or', ('lying', 1), ('telling-truth', 0)), ('->', ('telling-truth', 1), ('lying', 0))), 'solution': (False, True), 'names': [np.str_('Benjamin'), np.str_('Oliver')], 'knight_knave_terms': {'knight': 'hero', 'knave': 'villain', 'a_knight': np.str_('a hero'), 'a_knave': np.str_('a villain'), 'Knight': 'Hero', 'Knave': 'Villain'}}"
+    },
+    {
+      "question": "A very special island is inhabited only by saints and sinners. Saints always tell the truth, and sinners always lie. You meet 2 inhabitants: Alexander, and Sofia. Alexander expressed that Sofia is a saint and Sofia is a sinner. \"Alexander is not a sinner,\" Sofia claimed. So who is a saint and who is a sinner? (Format your answer like: \"Alexander is a saint/sinner, and Sofia is a saint/sinner\")",
+      "expected_answer": "Alexander is a sinner, and Sofia is a sinner.",
+      "model_answer": "Alexander is a sinner, and Sofia is a sinner",
+      "full_model_response": "<answer>Alexander is a sinner, and Sofia is a sinner</answer>",
+      "score": 1.0,
+      "metadata": "{'statements': (('and', ('telling-truth', 1), ('lying', 1)), ('not', ('lying', 0))), 'solution': (False, False), 'names': [np.str_('Alexander'), np.str_('Sofia')], 'knight_knave_terms': {'knight': 'saint', 'knave': 'sinner', 'a_knight': np.str_('a saint'), 'a_knave': np.str_('a sinner'), 'Knight': 'Saint', 'Knave': 'Sinner'}}"
+    },
+    {
+      "question": "A very special island is inhabited only by altruists and egoists. Altruists always tell the truth, and egoists always lie. You meet 2 inhabitants: Sophia, and Oliver. Sophia said that Sophia is an altruist if and only if Oliver is an egoist. Oliver noted, \"Sophia is an egoist if and only if Oliver is an altruist\". So who is an altruist and who is an egoist? (Format your answer like: \"Sophia is a altruist/egoist, and Oliver is a altruist/egoist\")",
+      "expected_answer": "Sophia is an egoist, and Oliver is an egoist.",
+      "model_answer": "Sophia is a egoist, and Oliver is a egoist",
+      "full_model_response": "<answer>Sophia is a egoist, and Oliver is a egoist</answer>",
+      "score": 1.0,
+      "metadata": "{'statements': (('<=>', ('telling-truth', 0), ('lying', 1)), ('<=>', ('lying', 0), ('telling-truth', 1))), 'solution': (False, False), 'names': [np.str_('Sophia'), np.str_('Oliver')], 'knight_knave_terms': {'knight': 'altruist', 'knave': 'egoist', 'a_knight': np.str_('an altruist'), 'a_knave': np.str_('an egoist'), 'Knight': 'Altruist', 'Knave': 'Egoist'}}"
+    },
+    {
+      "question": "A very special island is inhabited only by altruists and egoists. Altruists always tell the truth, and egoists always lie. You meet 2 inhabitants: Riley, and Lily. Riley commented, \"Riley is an altruist if and only if Lily is an altruist\". In Lily's words: \"Riley is not an altruist\". So who is an altruist and who is an egoist? (Format your answer like: \"Riley is a altruist/egoist, and Lily is a altruist/egoist\")",
+      "expected_answer": "Riley is an egoist, and Lily is an altruist.",
+      "model_answer": "Riley is a egoist, and Lily is a altruist",
+      "full_model_response": "<answer>Riley is a egoist, and Lily is a altruist</answer>",
+      "score": 1.0,
+      "metadata": "{'statements': (('<=>', ('telling-truth', 0), ('telling-truth', 1)), ('not', ('telling-truth', 0))), 'solution': (False, True), 'names': [np.str_('Riley'), np.str_('Lily')], 'knight_knave_terms': {'knight': 'altruist', 'knave': 'egoist', 'a_knight': np.str_('an altruist'), 'a_knave': np.str_('an egoist'), 'Knight': 'Altruist', 'Knave': 'Egoist'}}"
+    },
+    {
+      "question": "A very special island is inhabited only by sages and fools. Sages always tell the truth, and fools always lie. You meet 2 inhabitants: Abigail, and Lily. \"Lily is not a fool,\" Abigail mentioned. As Lily put it, \"Abigail is a fool or Abigail is a sage\". So who is a sage and who is a fool? (Format your answer like: \"Abigail is a sage/fool, and Lily is a sage/fool\")",
+      "expected_answer": "Abigail is a sage, and Lily is a sage.",
+      "model_answer": "Abigail is a sage, and Lily is a sage",
+      "full_model_response": "<answer>Abigail is a sage, and Lily is a sage</answer>",
+      "score": 1.0,
+      "metadata": "{'statements': (('not', ('lying', 1)), ('or', ('lying', 0), ('telling-truth', 0))), 'solution': (True, True), 'names': [np.str_('Abigail'), np.str_('Lily')], 'knight_knave_terms': {'knight': 'sage', 'knave': 'fool', 'a_knight': np.str_('a sage'), 'a_knave': np.str_('a fool'), 'Knight': 'Sage', 'Knave': 'Fool'}}"
+    }
+  ]
+}
diff --git a/eval/r1/test/summary.json b/eval/r1/test/summary.json
new file mode 100644
index 00000000..f89fb259
--- /dev/null
+++ b/eval/r1/test/summary.json
@@ -0,0 +1,100 @@
+[
+  {
+    "dataset_name": "knights_knaves",
+    "model": "deepseek/deepseek-r1",
+    "size": 10,
+    "provider": "Nebius",
+    "average_score": 1.0,
+    "total_examples": 10,
+    "timestamp": "2025-02-24T20:50:44.558572",
+    "config": {
+      "n_people": 2,
+      "depth_constraint": 2,
+      "width_constraint": 2,
+      "size": 10,
+      "seed": 42
+    },
+    "results": [
+      {
+        "question": "A very special island is inhabited only by sages and fools. Sages always tell the truth, and fools always lie. You meet 2 inhabitants: Zoey, and Riley. Zoey commented, \"Riley is a fool\". In Riley's words: \"Zoey is a sage or Riley is a sage\". So who is a sage and who is a fool? (Format your answer like: \"Zoey is a sage/fool, and Riley is a sage/fool\")",
+        "expected_answer": "Zoey is a fool, and Riley is a sage.",
+        "model_answer": "Zoey is a fool, and Riley is a sage",
+        "full_model_response": "<answer>Zoey is a fool, and Riley is a sage</answer>",
+        "score": 1.0,
+        "metadata": "{'statements': (('lying', 1), ('or', ('telling-truth', 0), ('telling-truth', 1))), 'solution': (False, True), 'names': [np.str_('Zoey'), np.str_('Riley')], 'knight_knave_terms': {'knight': 'sage', 'knave': 'fool', 'a_knight': np.str_('a sage'), 'a_knave': np.str_('a fool'), 'Knight': 'Sage', 'Knave': 'Fool'}}"
+      },
+      {
+        "question": "A very special island is inhabited only by pioneers and laggards. Pioneers always tell the truth, and laggards always lie. You meet 2 inhabitants: Riley, and Olivia. \"If Riley is a pioneer then Olivia is a laggard\" - Riley. Olivia stated, \"Olivia is a pioneer and Riley is a laggard\". So who is a pioneer and who is a laggard? (Format your answer like: \"Riley is a pioneer/laggard, and Olivia is a pioneer/laggard\")",
+        "expected_answer": "Riley is a pioneer, and Olivia is a laggard.",
+        "model_answer": "Riley is a pioneer, and Olivia is a laggard",
+        "full_model_response": "<answer>Riley is a pioneer, and Olivia is a laggard</answer>",
+        "score": 1.0,
+        "metadata": "{'statements': (('->', ('telling-truth', 0), ('lying', 1)), ('and', ('telling-truth', 1), ('lying', 0))), 'solution': (True, False), 'names': [np.str_('Riley'), np.str_('Olivia')], 'knight_knave_terms': {'knight': 'pioneer', 'knave': 'laggard', 'a_knight': np.str_('a pioneer'), 'a_knave': np.str_('a laggard'), 'Knight': 'Pioneer', 'Knave': 'Laggard'}}"
+      },
+      {
+        "question": "A very special island is inhabited only by saints and sinners. Saints always tell the truth, and sinners always lie. You meet 2 inhabitants: Samuel, and Jacob. Samuel expressed that If Samuel is a saint then Jacob is a sinner. Jacob was heard saying, \"If Samuel is a saint then Samuel is a sinner\". So who is a saint and who is a sinner? (Format your answer like: \"Samuel is a saint/sinner, and Jacob is a saint/sinner\")",
+        "expected_answer": "Samuel is a saint, and Jacob is a sinner.",
+        "model_answer": "Samuel is a saint, and Jacob is a sinner",
+        "full_model_response": "<answer>Samuel is a saint, and Jacob is a sinner</answer>",
+        "score": 1.0,
+        "metadata": "{'statements': (('->', ('telling-truth', 0), ('lying', 1)), ('->', ('telling-truth', 0), ('lying', 0))), 'solution': (True, False), 'names': [np.str_('Samuel'), np.str_('Jacob')], 'knight_knave_terms': {'knight': 'saint', 'knave': 'sinner', 'a_knight': np.str_('a saint'), 'a_knave': np.str_('a sinner'), 'Knight': 'Saint', 'Knave': 'Sinner'}}"
+      },
+      {
+        "question": "A very special island is inhabited only by pioneers and laggards. Pioneers always tell the truth, and laggards always lie. You meet 2 inhabitants: Olivia, and Lily. \"Lily is a laggard and Lily is a pioneer\" - Olivia. Lily noted, \"If Olivia is a pioneer then Lily is a pioneer\". So who is a pioneer and who is a laggard? (Format your answer like: \"Olivia is a pioneer/laggard, and Lily is a pioneer/laggard\")",
+        "expected_answer": "Olivia is a laggard, and Lily is a pioneer.",
+        "model_answer": "Olivia is a laggard, and Lily is a pioneer",
+        "full_model_response": "<answer>Olivia is a laggard, and Lily is a pioneer</answer>",
+        "score": 1.0,
+        "metadata": "{'statements': (('and', ('lying', 1), ('telling-truth', 1)), ('->', ('telling-truth', 0), ('telling-truth', 1))), 'solution': (False, True), 'names': [np.str_('Olivia'), np.str_('Lily')], 'knight_knave_terms': {'knight': 'pioneer', 'knave': 'laggard', 'a_knight': np.str_('a pioneer'), 'a_knave': np.str_('a laggard'), 'Knight': 'Pioneer', 'Knave': 'Laggard'}}"
+      },
+      {
+        "question": "A very special island is inhabited only by altruists and egoists. Altruists always tell the truth, and egoists always lie. You meet 2 inhabitants: Mason, and Jack. Mason expressed that If Jack is an egoist then Mason is an altruist. In Jack's words: \"Mason is not an altruist\". So who is an altruist and who is an egoist? (Format your answer like: \"Mason is a altruist/egoist, and Jack is a altruist/egoist\")",
+        "expected_answer": "Mason is an altruist, and Jack is an egoist.",
+        "model_answer": "Mason is a altruist, and Jack is a egoist",
+        "full_model_response": "<answer>Mason is a altruist, and Jack is a egoist</answer>",
+        "score": 1.0,
+        "metadata": "{'statements': (('->', ('lying', 1), ('telling-truth', 0)), ('not', ('telling-truth', 0))), 'solution': (True, False), 'names': [np.str_('Mason'), np.str_('Jack')], 'knight_knave_terms': {'knight': 'altruist', 'knave': 'egoist', 'a_knight': np.str_('an altruist'), 'a_knave': np.str_('an egoist'), 'Knight': 'Altruist', 'Knave': 'Egoist'}}"
+      },
+      {
+        "question": "A very special island is inhabited only by heros and villains. Heros always tell the truth, and villains always lie. You meet 2 inhabitants: Benjamin, and Oliver. Benjamin noted, \"Oliver is a villain or Benjamin is a hero\". Oliver commented, \"If Oliver is a hero then Benjamin is a villain\". So who is a hero and who is a villain? (Format your answer like: \"Benjamin is a hero/villain, and Oliver is a hero/villain\")",
+        "expected_answer": "Benjamin is a villain, and Oliver is a hero.",
+        "model_answer": "Benjamin is a villain, and Oliver is a hero",
+        "full_model_response": "<answer>Benjamin is a villain, and Oliver is a hero</answer>",
+        "score": 1.0,
+        "metadata": "{'statements': (('or', ('lying', 1), ('telling-truth', 0)), ('->', ('telling-truth', 1), ('lying', 0))), 'solution': (False, True), 'names': [np.str_('Benjamin'), np.str_('Oliver')], 'knight_knave_terms': {'knight': 'hero', 'knave': 'villain', 'a_knight': np.str_('a hero'), 'a_knave': np.str_('a villain'), 'Knight': 'Hero', 'Knave': 'Villain'}}"
+      },
+      {
+        "question": "A very special island is inhabited only by saints and sinners. Saints always tell the truth, and sinners always lie. You meet 2 inhabitants: Alexander, and Sofia. Alexander expressed that Sofia is a saint and Sofia is a sinner. \"Alexander is not a sinner,\" Sofia claimed. So who is a saint and who is a sinner? (Format your answer like: \"Alexander is a saint/sinner, and Sofia is a saint/sinner\")",
+        "expected_answer": "Alexander is a sinner, and Sofia is a sinner.",
+        "model_answer": "Alexander is a sinner, and Sofia is a sinner",
+        "full_model_response": "<answer>Alexander is a sinner, and Sofia is a sinner</answer>",
+        "score": 1.0,
+        "metadata": "{'statements': (('and', ('telling-truth', 1), ('lying', 1)), ('not', ('lying', 0))), 'solution': (False, False), 'names': [np.str_('Alexander'), np.str_('Sofia')], 'knight_knave_terms': {'knight': 'saint', 'knave': 'sinner', 'a_knight': np.str_('a saint'), 'a_knave': np.str_('a sinner'), 'Knight': 'Saint', 'Knave': 'Sinner'}}"
+      },
+      {
+        "question": "A very special island is inhabited only by altruists and egoists. Altruists always tell the truth, and egoists always lie. You meet 2 inhabitants: Sophia, and Oliver. Sophia said that Sophia is an altruist if and only if Oliver is an egoist. Oliver noted, \"Sophia is an egoist if and only if Oliver is an altruist\". So who is an altruist and who is an egoist? (Format your answer like: \"Sophia is a altruist/egoist, and Oliver is a altruist/egoist\")",
+        "expected_answer": "Sophia is an egoist, and Oliver is an egoist.",
+        "model_answer": "Sophia is a egoist, and Oliver is a egoist",
+        "full_model_response": "<answer>Sophia is a egoist, and Oliver is a egoist</answer>",
+        "score": 1.0,
+        "metadata": "{'statements': (('<=>', ('telling-truth', 0), ('lying', 1)), ('<=>', ('lying', 0), ('telling-truth', 1))), 'solution': (False, False), 'names': [np.str_('Sophia'), np.str_('Oliver')], 'knight_knave_terms': {'knight': 'altruist', 'knave': 'egoist', 'a_knight': np.str_('an altruist'), 'a_knave': np.str_('an egoist'), 'Knight': 'Altruist', 'Knave': 'Egoist'}}"
+      },
+      {
+        "question": "A very special island is inhabited only by altruists and egoists. Altruists always tell the truth, and egoists always lie. You meet 2 inhabitants: Riley, and Lily. Riley commented, \"Riley is an altruist if and only if Lily is an altruist\". In Lily's words: \"Riley is not an altruist\". So who is an altruist and who is an egoist? (Format your answer like: \"Riley is a altruist/egoist, and Lily is a altruist/egoist\")",
+        "expected_answer": "Riley is an egoist, and Lily is an altruist.",
+        "model_answer": "Riley is a egoist, and Lily is a altruist",
+        "full_model_response": "<answer>Riley is a egoist, and Lily is a altruist</answer>",
+        "score": 1.0,
+        "metadata": "{'statements': (('<=>', ('telling-truth', 0), ('telling-truth', 1)), ('not', ('telling-truth', 0))), 'solution': (False, True), 'names': [np.str_('Riley'), np.str_('Lily')], 'knight_knave_terms': {'knight': 'altruist', 'knave': 'egoist', 'a_knight': np.str_('an altruist'), 'a_knave': np.str_('an egoist'), 'Knight': 'Altruist', 'Knave': 'Egoist'}}"
+      },
+      {
+        "question": "A very special island is inhabited only by sages and fools. Sages always tell the truth, and fools always lie. You meet 2 inhabitants: Abigail, and Lily. \"Lily is not a fool,\" Abigail mentioned. As Lily put it, \"Abigail is a fool or Abigail is a sage\". So who is a sage and who is a fool? (Format your answer like: \"Abigail is a sage/fool, and Lily is a sage/fool\")",
+        "expected_answer": "Abigail is a sage, and Lily is a sage.",
+        "model_answer": "Abigail is a sage, and Lily is a sage",
+        "full_model_response": "<answer>Abigail is a sage, and Lily is a sage</answer>",
+        "score": 1.0,
+        "metadata": "{'statements': (('not', ('lying', 1)), ('or', ('lying', 0), ('telling-truth', 0))), 'solution': (True, True), 'names': [np.str_('Abigail'), np.str_('Lily')], 'knight_knave_terms': {'knight': 'sage', 'knave': 'fool', 'a_knight': np.str_('a sage'), 'a_knave': np.str_('a fool'), 'Knight': 'Sage', 'Knave': 'Fool'}}"
+      }
+    ]
+  }
+]
diff --git a/eval/r1/yaml/test.yaml b/eval/r1/yaml/test.yaml
index b6956a2a..35ca90f1 100644
--- a/eval/r1/yaml/test.yaml
+++ b/eval/r1/yaml/test.yaml
@@ -1,7 +1,7 @@
 model: deepseek/deepseek-r1
 category: test
 datasets:
-  - YOUR_DATASET_NAME
+  - knights_knaves
 eval_dir: eval/r1
 dataset_size: 10
 dataset_seed: 42
diff --git a/reasoning_gym/logic/__init__.py b/reasoning_gym/logic/__init__.py
index db976fba..422e0235 100644
--- a/reasoning_gym/logic/__init__.py
+++ b/reasoning_gym/logic/__init__.py
@@ -4,6 +4,7 @@
 
 from .aiw import AliceInWonderlandConfig, AliceInWonderlandDataset
 from .circuit_logic import CircuitLogicConfig, CircuitLogicDataset
+from .knights_knaves import KnightsKnavesConfig, KnightsKnavesDataset
 from .propositional_logic import PropositionalLogicConfig, PropositionalLogicDataset
 from .self_reference import SelfReferenceConfig, SelfReferenceDataset
 from .syllogisms import SyllogismConfig, SyllogismDataset
@@ -24,4 +25,6 @@
     "SelfReferenceDataset",
     "CircuitLogicConfig",
     "CircuitLogicDataset",
+    "KnightsKnavesConfig",
+    "KnightsKnavesDataset",
 ]
diff --git a/reasoning_gym/logic/knights_knaves.py b/reasoning_gym/logic/knights_knaves.py
index bdb83c14..aa1bcfda 100644
--- a/reasoning_gym/logic/knights_knaves.py
+++ b/reasoning_gym/logic/knights_knaves.py
@@ -408,10 +408,11 @@ def __generate_problem(self, rng: Random) -> dict[str, Any]:
 
         return {"question": question, "answer": answer, "metadata": metadata}
 
-    def _normalize_answer(self, answer: str) -> set[tuple[str, str]]:
+    @staticmethod
+    def _normalize_answer(answer: str) -> set[tuple[str, str]]:
         """Convert answer string into normalized set of (name, role) tuples"""
         # Remove common punctuation and standardize spacing
-        answer = answer.lower().strip().replace(".", "").replace(",", "")
+        answer = answer.lower().strip().replace(".", " ").replace(",", " ").replace(")", " ").replace("(", " ")
 
         # Split on 'and' or spaces for different formats
         parts = [p.strip() for p in answer.replace(" and ", " ").split()]
@@ -421,7 +422,7 @@ def _normalize_answer(self, answer: str) -> set[tuple[str, str]]:
         current_name = None
 
         for part in parts:
-            if part in ["is", "a"]:
+            if part in ["is", "a", "an"]:
                 continue
             if part in VALID_ROLES:
                 if current_name:
@@ -440,6 +441,7 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
         try:
             oracle_assignments = self._normalize_answer(entry["answer"])
             answer_assignments = self._normalize_answer(answer)
+            print(answer_assignments)
 
             # Full credit for exact assignments regardless of order
             if oracle_assignments == answer_assignments:
diff --git a/tests/test_knights_knaves.py b/tests/test_knights_knaves.py
new file mode 100644
index 00000000..fe539c0a
--- /dev/null
+++ b/tests/test_knights_knaves.py
@@ -0,0 +1,187 @@
+import pytest
+
+from reasoning_gym.logic.knights_knaves import KnightsKnavesConfig, KnightsKnavesDataset
+
+
+def test_config_validation():
+    """Test that invalid configs raise appropriate errors"""
+    with pytest.raises(AssertionError):
+        config = KnightsKnavesConfig(n_people=0)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = KnightsKnavesConfig(depth_constraint=0)
+        config.validate()
+
+    with pytest.raises(AssertionError):
+        config = KnightsKnavesConfig(width_constraint=0)
+        config.validate()
+
+
+def test_deterministic():
+    """Test that dataset generates same items with same seed"""
+    config = KnightsKnavesConfig(seed=42, size=10)
+    dataset1 = KnightsKnavesDataset(config)
+    dataset2 = KnightsKnavesDataset(config)
+
+    for i in range(len(dataset1)):
+        assert dataset1[i] == dataset2[i]
+
+
+def test_items():
+    """Test basic properties of generated items"""
+    config = KnightsKnavesConfig(
+        n_people=2,
+        depth_constraint=2,
+        width_constraint=2,
+        size=100,
+        seed=42,
+    )
+    dataset = KnightsKnavesDataset(config)
+
+    for i in range(len(dataset)):
+        item = dataset[i]
+        assert isinstance(item, dict)
+        assert "question" in item
+        assert "answer" in item
+        assert "metadata" in item
+
+
+def test_solution():
+    config = KnightsKnavesConfig(
+        n_people=2,
+        depth_constraint=2,
+        width_constraint=2,
+        size=100,
+        seed=42,
+    )
+    dataset = KnightsKnavesDataset(config)
+
+    problem = dataset[0]
+    solutions = KnightsKnavesDataset.find_solution(problem["metadata"]["statements"])
+    assert len(solutions) == 1, "Should have exactly one solution"
+    assert solutions[0] == problem["metadata"]["solution"], "find_solution should match metadata solution"
+
+
+def test_specific_problem():
+    """Test a specific problem from the dataset"""
+    test_statements = (
+        ("or", ("telling-truth", 3), ("telling-truth", 2)),
+        ("not", ("telling-truth", 2)),
+        ("->", ("lying", 0), ("telling-truth", 3)),
+        ("->", ("lying", 1), ("lying", 4)),
+        ("not", ("lying", 0)),
+    )
+    test_solutions = KnightsKnavesDataset.find_solution(test_statements)
+    assert test_solutions == [
+        (True, False, True, False, True)
+    ], "Solution should be [(True, False, True, False, True)] for test example"
+
+
+def test_score_answer():
+    config = KnightsKnavesConfig(
+        n_people=2,
+        depth_constraint=2,
+        width_constraint=2,
+        size=100,
+        seed=42,
+    )
+    dataset = KnightsKnavesDataset(config)
+    problem = dataset[0]
+    correct_answer = problem["answer"]  ## 'Zoey is a fool, and Riley is a sage.'
+    print(correct_answer)
+    half_answer = "Zoey is a fool and Riley is a fool."
+    wrong_answer = "Zoey sage Riley fool"
+    modified_answer = "(Zoey, fool), (Riley, sage)"
+    flipped_answer = "(Riley,sage), (Zoey,fool)"
+
+    assert dataset.score_answer(correct_answer, problem) == 1.0
+    assert abs(dataset.score_answer(half_answer, problem) - 0.65) < 1e-10
+    assert dataset.score_answer(modified_answer, problem) == 1.0
+    assert dataset.score_answer(wrong_answer, problem) == 0.01
+    print("flipped")
+    assert dataset.score_answer(flipped_answer, problem) == 1.0
+
+
+def test_people_count():
+    """Test that different parameters generate different problems"""
+    config_more_people = KnightsKnavesConfig(
+        n_people=4,
+        depth_constraint=2,
+        width_constraint=2,
+        size=100,
+        seed=42,
+    )
+    dataset_more_people = KnightsKnavesDataset(config_more_people)
+    item = dataset_more_people[0]
+    assert len(item["metadata"]["names"]) == 4
+
+
+def test_zero_multiple_solutions():
+    no_solution_statements = (("telling-truth", 1), ("lying", 0))
+    solutions = KnightsKnavesDataset.find_solution(no_solution_statements)
+    assert len(solutions) == 0, "Should have no solutions for contradictory statements"
+
+    multiple_solutions_statements = (("telling-truth", 1), ("telling-truth", 0))
+    solutions = KnightsKnavesDataset.find_solution(multiple_solutions_statements)
+    assert len(solutions) == 2, "Should have two solutions for consistent statements"
+
+
+def test_invalid_statements():
+    """Test handling of invalid statements"""
+    with pytest.raises(KeyError):
+        KnightsKnavesDataset.test_satisfiability(("invalid-operator", 0), (True,))
+
+
+def test_normalize_answer():
+    """Test normalization of answer strings"""
+    answer1 = "Zoey is a fool, and Riley is a sage."
+    answer2 = "Zoey fool, Riley sage"
+    answer3 = "(Zoey, fool), (Riley, sage)"
+    answer4 = "(Riley, sage), (Zoey, fool)"
+    normalized1 = KnightsKnavesDataset._normalize_answer(answer1)
+    assert normalized1 == {("zoey", "fool"), ("riley", "sage")}
+    normalized2 = KnightsKnavesDataset._normalize_answer(answer2)
+    assert normalized2 == {("zoey", "fool"), ("riley", "sage")}
+    normalized3 = KnightsKnavesDataset._normalize_answer(answer3)
+    assert normalized3 == {("zoey", "fool"), ("riley", "sage")}
+    normalized4 = KnightsKnavesDataset._normalize_answer(answer4)
+    assert normalized4 == {("zoey", "fool"), ("riley", "sage")}
+
+
+def test_satisfiability():
+    """Test the test_satisfiability method with different logical operators"""
+    # Test basic operators
+    assert KnightsKnavesDataset.test_satisfiability(("telling-truth", 0), (True,))
+    assert not KnightsKnavesDataset.test_satisfiability(("telling-truth", 0), (False,))
+    assert not KnightsKnavesDataset.test_satisfiability(("lying", 0), (True,))
+    assert KnightsKnavesDataset.test_satisfiability(("lying", 0), (False,))
+
+    # Test NOT
+    assert not KnightsKnavesDataset.test_satisfiability(("not", ("telling-truth", 0)), (True,))
+    assert KnightsKnavesDataset.test_satisfiability(("not", ("telling-truth", 0)), (False,))
+
+    # Test AND
+    assert KnightsKnavesDataset.test_satisfiability(("and", ("telling-truth", 0), ("telling-truth", 1)), (True, True))
+    assert not KnightsKnavesDataset.test_satisfiability(
+        ("and", ("telling-truth", 0), ("telling-truth", 1)), (True, False)
+    )
+
+    # Test OR
+    assert KnightsKnavesDataset.test_satisfiability(("or", ("telling-truth", 0), ("telling-truth", 1)), (True, False))
+    assert not KnightsKnavesDataset.test_satisfiability(
+        ("or", ("telling-truth", 0), ("telling-truth", 1)), (False, False)
+    )
+
+    # Test implication
+    assert KnightsKnavesDataset.test_satisfiability(("->", ("telling-truth", 0), ("telling-truth", 1)), (False, False))
+    assert not KnightsKnavesDataset.test_satisfiability(
+        ("->", ("telling-truth", 0), ("telling-truth", 1)), (True, False)
+    )
+
+    # Test bi-implication
+    assert KnightsKnavesDataset.test_satisfiability(("<=>", ("telling-truth", 0), ("telling-truth", 1)), (True, True))
+    assert KnightsKnavesDataset.test_satisfiability(("<=>", ("telling-truth", 0), ("telling-truth", 1)), (False, False))
+    assert not KnightsKnavesDataset.test_satisfiability(
+        ("<=>", ("telling-truth", 0), ("telling-truth", 1)), (True, False)
+    )

From 5e1d0c058498209709916415542af94ca650b674 Mon Sep 17 00:00:00 2001
From: vncntt <provincent8412@gmail.com>
Date: Tue, 25 Feb 2025 06:07:54 -0800
Subject: [PATCH 4/9] remove r1 tests

---
 eval/r1/test/knights_knaves.json |  98 ------------------------------
 eval/r1/test/summary.json        | 100 -------------------------------
 2 files changed, 198 deletions(-)
 delete mode 100644 eval/r1/test/knights_knaves.json
 delete mode 100644 eval/r1/test/summary.json

diff --git a/eval/r1/test/knights_knaves.json b/eval/r1/test/knights_knaves.json
deleted file mode 100644
index 79ea045d..00000000
--- a/eval/r1/test/knights_knaves.json
+++ /dev/null
@@ -1,98 +0,0 @@
-{
-  "dataset_name": "knights_knaves",
-  "model": "deepseek/deepseek-r1",
-  "size": 10,
-  "provider": "Nebius",
-  "average_score": 1.0,
-  "total_examples": 10,
-  "timestamp": "2025-02-24T20:50:44.558572",
-  "config": {
-    "n_people": 2,
-    "depth_constraint": 2,
-    "width_constraint": 2,
-    "size": 10,
-    "seed": 42
-  },
-  "results": [
-    {
-      "question": "A very special island is inhabited only by sages and fools. Sages always tell the truth, and fools always lie. You meet 2 inhabitants: Zoey, and Riley. Zoey commented, \"Riley is a fool\". In Riley's words: \"Zoey is a sage or Riley is a sage\". So who is a sage and who is a fool? (Format your answer like: \"Zoey is a sage/fool, and Riley is a sage/fool\")",
-      "expected_answer": "Zoey is a fool, and Riley is a sage.",
-      "model_answer": "Zoey is a fool, and Riley is a sage",
-      "full_model_response": "<answer>Zoey is a fool, and Riley is a sage</answer>",
-      "score": 1.0,
-      "metadata": "{'statements': (('lying', 1), ('or', ('telling-truth', 0), ('telling-truth', 1))), 'solution': (False, True), 'names': [np.str_('Zoey'), np.str_('Riley')], 'knight_knave_terms': {'knight': 'sage', 'knave': 'fool', 'a_knight': np.str_('a sage'), 'a_knave': np.str_('a fool'), 'Knight': 'Sage', 'Knave': 'Fool'}}"
-    },
-    {
-      "question": "A very special island is inhabited only by pioneers and laggards. Pioneers always tell the truth, and laggards always lie. You meet 2 inhabitants: Riley, and Olivia. \"If Riley is a pioneer then Olivia is a laggard\" - Riley. Olivia stated, \"Olivia is a pioneer and Riley is a laggard\". So who is a pioneer and who is a laggard? (Format your answer like: \"Riley is a pioneer/laggard, and Olivia is a pioneer/laggard\")",
-      "expected_answer": "Riley is a pioneer, and Olivia is a laggard.",
-      "model_answer": "Riley is a pioneer, and Olivia is a laggard",
-      "full_model_response": "<answer>Riley is a pioneer, and Olivia is a laggard</answer>",
-      "score": 1.0,
-      "metadata": "{'statements': (('->', ('telling-truth', 0), ('lying', 1)), ('and', ('telling-truth', 1), ('lying', 0))), 'solution': (True, False), 'names': [np.str_('Riley'), np.str_('Olivia')], 'knight_knave_terms': {'knight': 'pioneer', 'knave': 'laggard', 'a_knight': np.str_('a pioneer'), 'a_knave': np.str_('a laggard'), 'Knight': 'Pioneer', 'Knave': 'Laggard'}}"
-    },
-    {
-      "question": "A very special island is inhabited only by saints and sinners. Saints always tell the truth, and sinners always lie. You meet 2 inhabitants: Samuel, and Jacob. Samuel expressed that If Samuel is a saint then Jacob is a sinner. Jacob was heard saying, \"If Samuel is a saint then Samuel is a sinner\". So who is a saint and who is a sinner? (Format your answer like: \"Samuel is a saint/sinner, and Jacob is a saint/sinner\")",
-      "expected_answer": "Samuel is a saint, and Jacob is a sinner.",
-      "model_answer": "Samuel is a saint, and Jacob is a sinner",
-      "full_model_response": "<answer>Samuel is a saint, and Jacob is a sinner</answer>",
-      "score": 1.0,
-      "metadata": "{'statements': (('->', ('telling-truth', 0), ('lying', 1)), ('->', ('telling-truth', 0), ('lying', 0))), 'solution': (True, False), 'names': [np.str_('Samuel'), np.str_('Jacob')], 'knight_knave_terms': {'knight': 'saint', 'knave': 'sinner', 'a_knight': np.str_('a saint'), 'a_knave': np.str_('a sinner'), 'Knight': 'Saint', 'Knave': 'Sinner'}}"
-    },
-    {
-      "question": "A very special island is inhabited only by pioneers and laggards. Pioneers always tell the truth, and laggards always lie. You meet 2 inhabitants: Olivia, and Lily. \"Lily is a laggard and Lily is a pioneer\" - Olivia. Lily noted, \"If Olivia is a pioneer then Lily is a pioneer\". So who is a pioneer and who is a laggard? (Format your answer like: \"Olivia is a pioneer/laggard, and Lily is a pioneer/laggard\")",
-      "expected_answer": "Olivia is a laggard, and Lily is a pioneer.",
-      "model_answer": "Olivia is a laggard, and Lily is a pioneer",
-      "full_model_response": "<answer>Olivia is a laggard, and Lily is a pioneer</answer>",
-      "score": 1.0,
-      "metadata": "{'statements': (('and', ('lying', 1), ('telling-truth', 1)), ('->', ('telling-truth', 0), ('telling-truth', 1))), 'solution': (False, True), 'names': [np.str_('Olivia'), np.str_('Lily')], 'knight_knave_terms': {'knight': 'pioneer', 'knave': 'laggard', 'a_knight': np.str_('a pioneer'), 'a_knave': np.str_('a laggard'), 'Knight': 'Pioneer', 'Knave': 'Laggard'}}"
-    },
-    {
-      "question": "A very special island is inhabited only by altruists and egoists. Altruists always tell the truth, and egoists always lie. You meet 2 inhabitants: Mason, and Jack. Mason expressed that If Jack is an egoist then Mason is an altruist. In Jack's words: \"Mason is not an altruist\". So who is an altruist and who is an egoist? (Format your answer like: \"Mason is a altruist/egoist, and Jack is a altruist/egoist\")",
-      "expected_answer": "Mason is an altruist, and Jack is an egoist.",
-      "model_answer": "Mason is a altruist, and Jack is a egoist",
-      "full_model_response": "<answer>Mason is a altruist, and Jack is a egoist</answer>",
-      "score": 1.0,
-      "metadata": "{'statements': (('->', ('lying', 1), ('telling-truth', 0)), ('not', ('telling-truth', 0))), 'solution': (True, False), 'names': [np.str_('Mason'), np.str_('Jack')], 'knight_knave_terms': {'knight': 'altruist', 'knave': 'egoist', 'a_knight': np.str_('an altruist'), 'a_knave': np.str_('an egoist'), 'Knight': 'Altruist', 'Knave': 'Egoist'}}"
-    },
-    {
-      "question": "A very special island is inhabited only by heros and villains. Heros always tell the truth, and villains always lie. You meet 2 inhabitants: Benjamin, and Oliver. Benjamin noted, \"Oliver is a villain or Benjamin is a hero\". Oliver commented, \"If Oliver is a hero then Benjamin is a villain\". So who is a hero and who is a villain? (Format your answer like: \"Benjamin is a hero/villain, and Oliver is a hero/villain\")",
-      "expected_answer": "Benjamin is a villain, and Oliver is a hero.",
-      "model_answer": "Benjamin is a villain, and Oliver is a hero",
-      "full_model_response": "<answer>Benjamin is a villain, and Oliver is a hero</answer>",
-      "score": 1.0,
-      "metadata": "{'statements': (('or', ('lying', 1), ('telling-truth', 0)), ('->', ('telling-truth', 1), ('lying', 0))), 'solution': (False, True), 'names': [np.str_('Benjamin'), np.str_('Oliver')], 'knight_knave_terms': {'knight': 'hero', 'knave': 'villain', 'a_knight': np.str_('a hero'), 'a_knave': np.str_('a villain'), 'Knight': 'Hero', 'Knave': 'Villain'}}"
-    },
-    {
-      "question": "A very special island is inhabited only by saints and sinners. Saints always tell the truth, and sinners always lie. You meet 2 inhabitants: Alexander, and Sofia. Alexander expressed that Sofia is a saint and Sofia is a sinner. \"Alexander is not a sinner,\" Sofia claimed. So who is a saint and who is a sinner? (Format your answer like: \"Alexander is a saint/sinner, and Sofia is a saint/sinner\")",
-      "expected_answer": "Alexander is a sinner, and Sofia is a sinner.",
-      "model_answer": "Alexander is a sinner, and Sofia is a sinner",
-      "full_model_response": "<answer>Alexander is a sinner, and Sofia is a sinner</answer>",
-      "score": 1.0,
-      "metadata": "{'statements': (('and', ('telling-truth', 1), ('lying', 1)), ('not', ('lying', 0))), 'solution': (False, False), 'names': [np.str_('Alexander'), np.str_('Sofia')], 'knight_knave_terms': {'knight': 'saint', 'knave': 'sinner', 'a_knight': np.str_('a saint'), 'a_knave': np.str_('a sinner'), 'Knight': 'Saint', 'Knave': 'Sinner'}}"
-    },
-    {
-      "question": "A very special island is inhabited only by altruists and egoists. Altruists always tell the truth, and egoists always lie. You meet 2 inhabitants: Sophia, and Oliver. Sophia said that Sophia is an altruist if and only if Oliver is an egoist. Oliver noted, \"Sophia is an egoist if and only if Oliver is an altruist\". So who is an altruist and who is an egoist? (Format your answer like: \"Sophia is a altruist/egoist, and Oliver is a altruist/egoist\")",
-      "expected_answer": "Sophia is an egoist, and Oliver is an egoist.",
-      "model_answer": "Sophia is a egoist, and Oliver is a egoist",
-      "full_model_response": "<answer>Sophia is a egoist, and Oliver is a egoist</answer>",
-      "score": 1.0,
-      "metadata": "{'statements': (('<=>', ('telling-truth', 0), ('lying', 1)), ('<=>', ('lying', 0), ('telling-truth', 1))), 'solution': (False, False), 'names': [np.str_('Sophia'), np.str_('Oliver')], 'knight_knave_terms': {'knight': 'altruist', 'knave': 'egoist', 'a_knight': np.str_('an altruist'), 'a_knave': np.str_('an egoist'), 'Knight': 'Altruist', 'Knave': 'Egoist'}}"
-    },
-    {
-      "question": "A very special island is inhabited only by altruists and egoists. Altruists always tell the truth, and egoists always lie. You meet 2 inhabitants: Riley, and Lily. Riley commented, \"Riley is an altruist if and only if Lily is an altruist\". In Lily's words: \"Riley is not an altruist\". So who is an altruist and who is an egoist? (Format your answer like: \"Riley is a altruist/egoist, and Lily is a altruist/egoist\")",
-      "expected_answer": "Riley is an egoist, and Lily is an altruist.",
-      "model_answer": "Riley is a egoist, and Lily is a altruist",
-      "full_model_response": "<answer>Riley is a egoist, and Lily is a altruist</answer>",
-      "score": 1.0,
-      "metadata": "{'statements': (('<=>', ('telling-truth', 0), ('telling-truth', 1)), ('not', ('telling-truth', 0))), 'solution': (False, True), 'names': [np.str_('Riley'), np.str_('Lily')], 'knight_knave_terms': {'knight': 'altruist', 'knave': 'egoist', 'a_knight': np.str_('an altruist'), 'a_knave': np.str_('an egoist'), 'Knight': 'Altruist', 'Knave': 'Egoist'}}"
-    },
-    {
-      "question": "A very special island is inhabited only by sages and fools. Sages always tell the truth, and fools always lie. You meet 2 inhabitants: Abigail, and Lily. \"Lily is not a fool,\" Abigail mentioned. As Lily put it, \"Abigail is a fool or Abigail is a sage\". So who is a sage and who is a fool? (Format your answer like: \"Abigail is a sage/fool, and Lily is a sage/fool\")",
-      "expected_answer": "Abigail is a sage, and Lily is a sage.",
-      "model_answer": "Abigail is a sage, and Lily is a sage",
-      "full_model_response": "<answer>Abigail is a sage, and Lily is a sage</answer>",
-      "score": 1.0,
-      "metadata": "{'statements': (('not', ('lying', 1)), ('or', ('lying', 0), ('telling-truth', 0))), 'solution': (True, True), 'names': [np.str_('Abigail'), np.str_('Lily')], 'knight_knave_terms': {'knight': 'sage', 'knave': 'fool', 'a_knight': np.str_('a sage'), 'a_knave': np.str_('a fool'), 'Knight': 'Sage', 'Knave': 'Fool'}}"
-    }
-  ]
-}
diff --git a/eval/r1/test/summary.json b/eval/r1/test/summary.json
deleted file mode 100644
index f89fb259..00000000
--- a/eval/r1/test/summary.json
+++ /dev/null
@@ -1,100 +0,0 @@
-[
-  {
-    "dataset_name": "knights_knaves",
-    "model": "deepseek/deepseek-r1",
-    "size": 10,
-    "provider": "Nebius",
-    "average_score": 1.0,
-    "total_examples": 10,
-    "timestamp": "2025-02-24T20:50:44.558572",
-    "config": {
-      "n_people": 2,
-      "depth_constraint": 2,
-      "width_constraint": 2,
-      "size": 10,
-      "seed": 42
-    },
-    "results": [
-      {
-        "question": "A very special island is inhabited only by sages and fools. Sages always tell the truth, and fools always lie. You meet 2 inhabitants: Zoey, and Riley. Zoey commented, \"Riley is a fool\". In Riley's words: \"Zoey is a sage or Riley is a sage\". So who is a sage and who is a fool? (Format your answer like: \"Zoey is a sage/fool, and Riley is a sage/fool\")",
-        "expected_answer": "Zoey is a fool, and Riley is a sage.",
-        "model_answer": "Zoey is a fool, and Riley is a sage",
-        "full_model_response": "<answer>Zoey is a fool, and Riley is a sage</answer>",
-        "score": 1.0,
-        "metadata": "{'statements': (('lying', 1), ('or', ('telling-truth', 0), ('telling-truth', 1))), 'solution': (False, True), 'names': [np.str_('Zoey'), np.str_('Riley')], 'knight_knave_terms': {'knight': 'sage', 'knave': 'fool', 'a_knight': np.str_('a sage'), 'a_knave': np.str_('a fool'), 'Knight': 'Sage', 'Knave': 'Fool'}}"
-      },
-      {
-        "question": "A very special island is inhabited only by pioneers and laggards. Pioneers always tell the truth, and laggards always lie. You meet 2 inhabitants: Riley, and Olivia. \"If Riley is a pioneer then Olivia is a laggard\" - Riley. Olivia stated, \"Olivia is a pioneer and Riley is a laggard\". So who is a pioneer and who is a laggard? (Format your answer like: \"Riley is a pioneer/laggard, and Olivia is a pioneer/laggard\")",
-        "expected_answer": "Riley is a pioneer, and Olivia is a laggard.",
-        "model_answer": "Riley is a pioneer, and Olivia is a laggard",
-        "full_model_response": "<answer>Riley is a pioneer, and Olivia is a laggard</answer>",
-        "score": 1.0,
-        "metadata": "{'statements': (('->', ('telling-truth', 0), ('lying', 1)), ('and', ('telling-truth', 1), ('lying', 0))), 'solution': (True, False), 'names': [np.str_('Riley'), np.str_('Olivia')], 'knight_knave_terms': {'knight': 'pioneer', 'knave': 'laggard', 'a_knight': np.str_('a pioneer'), 'a_knave': np.str_('a laggard'), 'Knight': 'Pioneer', 'Knave': 'Laggard'}}"
-      },
-      {
-        "question": "A very special island is inhabited only by saints and sinners. Saints always tell the truth, and sinners always lie. You meet 2 inhabitants: Samuel, and Jacob. Samuel expressed that If Samuel is a saint then Jacob is a sinner. Jacob was heard saying, \"If Samuel is a saint then Samuel is a sinner\". So who is a saint and who is a sinner? (Format your answer like: \"Samuel is a saint/sinner, and Jacob is a saint/sinner\")",
-        "expected_answer": "Samuel is a saint, and Jacob is a sinner.",
-        "model_answer": "Samuel is a saint, and Jacob is a sinner",
-        "full_model_response": "<answer>Samuel is a saint, and Jacob is a sinner</answer>",
-        "score": 1.0,
-        "metadata": "{'statements': (('->', ('telling-truth', 0), ('lying', 1)), ('->', ('telling-truth', 0), ('lying', 0))), 'solution': (True, False), 'names': [np.str_('Samuel'), np.str_('Jacob')], 'knight_knave_terms': {'knight': 'saint', 'knave': 'sinner', 'a_knight': np.str_('a saint'), 'a_knave': np.str_('a sinner'), 'Knight': 'Saint', 'Knave': 'Sinner'}}"
-      },
-      {
-        "question": "A very special island is inhabited only by pioneers and laggards. Pioneers always tell the truth, and laggards always lie. You meet 2 inhabitants: Olivia, and Lily. \"Lily is a laggard and Lily is a pioneer\" - Olivia. Lily noted, \"If Olivia is a pioneer then Lily is a pioneer\". So who is a pioneer and who is a laggard? (Format your answer like: \"Olivia is a pioneer/laggard, and Lily is a pioneer/laggard\")",
-        "expected_answer": "Olivia is a laggard, and Lily is a pioneer.",
-        "model_answer": "Olivia is a laggard, and Lily is a pioneer",
-        "full_model_response": "<answer>Olivia is a laggard, and Lily is a pioneer</answer>",
-        "score": 1.0,
-        "metadata": "{'statements': (('and', ('lying', 1), ('telling-truth', 1)), ('->', ('telling-truth', 0), ('telling-truth', 1))), 'solution': (False, True), 'names': [np.str_('Olivia'), np.str_('Lily')], 'knight_knave_terms': {'knight': 'pioneer', 'knave': 'laggard', 'a_knight': np.str_('a pioneer'), 'a_knave': np.str_('a laggard'), 'Knight': 'Pioneer', 'Knave': 'Laggard'}}"
-      },
-      {
-        "question": "A very special island is inhabited only by altruists and egoists. Altruists always tell the truth, and egoists always lie. You meet 2 inhabitants: Mason, and Jack. Mason expressed that If Jack is an egoist then Mason is an altruist. In Jack's words: \"Mason is not an altruist\". So who is an altruist and who is an egoist? (Format your answer like: \"Mason is a altruist/egoist, and Jack is a altruist/egoist\")",
-        "expected_answer": "Mason is an altruist, and Jack is an egoist.",
-        "model_answer": "Mason is a altruist, and Jack is a egoist",
-        "full_model_response": "<answer>Mason is a altruist, and Jack is a egoist</answer>",
-        "score": 1.0,
-        "metadata": "{'statements': (('->', ('lying', 1), ('telling-truth', 0)), ('not', ('telling-truth', 0))), 'solution': (True, False), 'names': [np.str_('Mason'), np.str_('Jack')], 'knight_knave_terms': {'knight': 'altruist', 'knave': 'egoist', 'a_knight': np.str_('an altruist'), 'a_knave': np.str_('an egoist'), 'Knight': 'Altruist', 'Knave': 'Egoist'}}"
-      },
-      {
-        "question": "A very special island is inhabited only by heros and villains. Heros always tell the truth, and villains always lie. You meet 2 inhabitants: Benjamin, and Oliver. Benjamin noted, \"Oliver is a villain or Benjamin is a hero\". Oliver commented, \"If Oliver is a hero then Benjamin is a villain\". So who is a hero and who is a villain? (Format your answer like: \"Benjamin is a hero/villain, and Oliver is a hero/villain\")",
-        "expected_answer": "Benjamin is a villain, and Oliver is a hero.",
-        "model_answer": "Benjamin is a villain, and Oliver is a hero",
-        "full_model_response": "<answer>Benjamin is a villain, and Oliver is a hero</answer>",
-        "score": 1.0,
-        "metadata": "{'statements': (('or', ('lying', 1), ('telling-truth', 0)), ('->', ('telling-truth', 1), ('lying', 0))), 'solution': (False, True), 'names': [np.str_('Benjamin'), np.str_('Oliver')], 'knight_knave_terms': {'knight': 'hero', 'knave': 'villain', 'a_knight': np.str_('a hero'), 'a_knave': np.str_('a villain'), 'Knight': 'Hero', 'Knave': 'Villain'}}"
-      },
-      {
-        "question": "A very special island is inhabited only by saints and sinners. Saints always tell the truth, and sinners always lie. You meet 2 inhabitants: Alexander, and Sofia. Alexander expressed that Sofia is a saint and Sofia is a sinner. \"Alexander is not a sinner,\" Sofia claimed. So who is a saint and who is a sinner? (Format your answer like: \"Alexander is a saint/sinner, and Sofia is a saint/sinner\")",
-        "expected_answer": "Alexander is a sinner, and Sofia is a sinner.",
-        "model_answer": "Alexander is a sinner, and Sofia is a sinner",
-        "full_model_response": "<answer>Alexander is a sinner, and Sofia is a sinner</answer>",
-        "score": 1.0,
-        "metadata": "{'statements': (('and', ('telling-truth', 1), ('lying', 1)), ('not', ('lying', 0))), 'solution': (False, False), 'names': [np.str_('Alexander'), np.str_('Sofia')], 'knight_knave_terms': {'knight': 'saint', 'knave': 'sinner', 'a_knight': np.str_('a saint'), 'a_knave': np.str_('a sinner'), 'Knight': 'Saint', 'Knave': 'Sinner'}}"
-      },
-      {
-        "question": "A very special island is inhabited only by altruists and egoists. Altruists always tell the truth, and egoists always lie. You meet 2 inhabitants: Sophia, and Oliver. Sophia said that Sophia is an altruist if and only if Oliver is an egoist. Oliver noted, \"Sophia is an egoist if and only if Oliver is an altruist\". So who is an altruist and who is an egoist? (Format your answer like: \"Sophia is a altruist/egoist, and Oliver is a altruist/egoist\")",
-        "expected_answer": "Sophia is an egoist, and Oliver is an egoist.",
-        "model_answer": "Sophia is a egoist, and Oliver is a egoist",
-        "full_model_response": "<answer>Sophia is a egoist, and Oliver is a egoist</answer>",
-        "score": 1.0,
-        "metadata": "{'statements': (('<=>', ('telling-truth', 0), ('lying', 1)), ('<=>', ('lying', 0), ('telling-truth', 1))), 'solution': (False, False), 'names': [np.str_('Sophia'), np.str_('Oliver')], 'knight_knave_terms': {'knight': 'altruist', 'knave': 'egoist', 'a_knight': np.str_('an altruist'), 'a_knave': np.str_('an egoist'), 'Knight': 'Altruist', 'Knave': 'Egoist'}}"
-      },
-      {
-        "question": "A very special island is inhabited only by altruists and egoists. Altruists always tell the truth, and egoists always lie. You meet 2 inhabitants: Riley, and Lily. Riley commented, \"Riley is an altruist if and only if Lily is an altruist\". In Lily's words: \"Riley is not an altruist\". So who is an altruist and who is an egoist? (Format your answer like: \"Riley is a altruist/egoist, and Lily is a altruist/egoist\")",
-        "expected_answer": "Riley is an egoist, and Lily is an altruist.",
-        "model_answer": "Riley is a egoist, and Lily is a altruist",
-        "full_model_response": "<answer>Riley is a egoist, and Lily is a altruist</answer>",
-        "score": 1.0,
-        "metadata": "{'statements': (('<=>', ('telling-truth', 0), ('telling-truth', 1)), ('not', ('telling-truth', 0))), 'solution': (False, True), 'names': [np.str_('Riley'), np.str_('Lily')], 'knight_knave_terms': {'knight': 'altruist', 'knave': 'egoist', 'a_knight': np.str_('an altruist'), 'a_knave': np.str_('an egoist'), 'Knight': 'Altruist', 'Knave': 'Egoist'}}"
-      },
-      {
-        "question": "A very special island is inhabited only by sages and fools. Sages always tell the truth, and fools always lie. You meet 2 inhabitants: Abigail, and Lily. \"Lily is not a fool,\" Abigail mentioned. As Lily put it, \"Abigail is a fool or Abigail is a sage\". So who is a sage and who is a fool? (Format your answer like: \"Abigail is a sage/fool, and Lily is a sage/fool\")",
-        "expected_answer": "Abigail is a sage, and Lily is a sage.",
-        "model_answer": "Abigail is a sage, and Lily is a sage",
-        "full_model_response": "<answer>Abigail is a sage, and Lily is a sage</answer>",
-        "score": 1.0,
-        "metadata": "{'statements': (('not', ('lying', 1)), ('or', ('lying', 0), ('telling-truth', 0))), 'solution': (True, True), 'names': [np.str_('Abigail'), np.str_('Lily')], 'knight_knave_terms': {'knight': 'sage', 'knave': 'fool', 'a_knight': np.str_('a sage'), 'a_knave': np.str_('a fool'), 'Knight': 'Sage', 'Knave': 'Fool'}}"
-      }
-    ]
-  }
-]

From 66c10c095b651d374bec8dd30005d3eabeedcb5d Mon Sep 17 00:00:00 2001
From: vncntt <provincent8412@gmail.com>
Date: Tue, 25 Feb 2025 06:49:28 -0800
Subject: [PATCH 5/9] fix test.yaml

---
 eval/r1/yaml/test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eval/r1/yaml/test.yaml b/eval/r1/yaml/test.yaml
index 35ca90f1..b6956a2a 100644
--- a/eval/r1/yaml/test.yaml
+++ b/eval/r1/yaml/test.yaml
@@ -1,7 +1,7 @@
 model: deepseek/deepseek-r1
 category: test
 datasets:
-  - knights_knaves
+  - YOUR_DATASET_NAME
 eval_dir: eval/r1
 dataset_size: 10
 dataset_seed: 42

From b8e7d54a8eb2e7ced1ed95627ae89486397b1bcc Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Tue, 25 Feb 2025 20:01:40 +0100
Subject: [PATCH 6/9] docs: Add attribution for Knights and Knaves dataset
 implementation

---
 NOTICE.txt                            | 12 ++++++++++++
 reasoning_gym/logic/knights_knaves.py | 14 ++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/NOTICE.txt b/NOTICE.txt
index 6d8be4db..250f0b01 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -60,6 +60,18 @@ or served as inspiration for parts of our codebase:
 - License: MIT
 - Copyright (c) 2024 AlphaPav
 - Note: Adapted for K&K problem generation
+- Citation:
+```
+@article{xie2024memorization,
+title={On Memorization of Large Language Models in Logical Reasoning}, 
+author={Chulin Xie and Yangsibo Huang and Chiyuan Zhang and Da Yu and Xinyun Chen and Bill Yuchen Lin and Bo Li and Badih Ghazi and Ravi Kumar},
+year={2024},
+eprint={2410.23123},
+archivePrefix={arXiv},
+primaryClass={cs.CL},
+url={https://arxiv.org/abs/2410.23123}, 
+}
+```
 
 We are deeply grateful to all authors of these projects for making their work available
 under open source licenses, enabling us to build upon their contributions.
diff --git a/reasoning_gym/logic/knights_knaves.py b/reasoning_gym/logic/knights_knaves.py
index aa1bcfda..ce05f9d3 100644
--- a/reasoning_gym/logic/knights_knaves.py
+++ b/reasoning_gym/logic/knights_knaves.py
@@ -311,6 +311,20 @@ def _format_knight_knave(self, names, knight_knave, statement, negation=False):
 class KnightsKnavesDataset(ProceduralDataset):
     """
     Generates random knights and knaves problems.
+    
+    This implementation is adapted from the Knights and Knaves problem generator in:
+    https://github.com/AlphaPav/mem-kk-logic
+    
+    As described in the paper:
+    @article{xie2024memorization,
+    title={On Memorization of Large Language Models in Logical Reasoning}, 
+    author={Chulin Xie and Yangsibo Huang and Chiyuan Zhang and Da Yu and Xinyun Chen and Bill Yuchen Lin and Bo Li and Badih Ghazi and Ravi Kumar},
+    year={2024},
+    eprint={2410.23123},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL},
+    url={https://arxiv.org/abs/2410.23123}, 
+    }
     """
 
     def __init__(self, config: KnightsKnavesConfig):

From 58015b6a94aa7e901be1e0e5bdc7d34227ecd950 Mon Sep 17 00:00:00 2001
From: "Andreas Koepf (aider)" <andreas.koepf@provisio.com>
Date: Tue, 25 Feb 2025 20:03:05 +0100
Subject: [PATCH 7/9] style: Format citation in NOTICE.txt for consistent
 spacing

---
 NOTICE.txt                            | 14 +++++++-------
 reasoning_gym/logic/knights_knaves.py |  8 ++++----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/NOTICE.txt b/NOTICE.txt
index 250f0b01..c6efe31e 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -63,13 +63,13 @@ or served as inspiration for parts of our codebase:
 - Citation:
 ```
 @article{xie2024memorization,
-title={On Memorization of Large Language Models in Logical Reasoning}, 
-author={Chulin Xie and Yangsibo Huang and Chiyuan Zhang and Da Yu and Xinyun Chen and Bill Yuchen Lin and Bo Li and Badih Ghazi and Ravi Kumar},
-year={2024},
-eprint={2410.23123},
-archivePrefix={arXiv},
-primaryClass={cs.CL},
-url={https://arxiv.org/abs/2410.23123}, 
+      title={On Memorization of Large Language Models in Logical Reasoning},
+      author={Chulin Xie and Yangsibo Huang and Chiyuan Zhang and Da Yu and Xinyun Chen and Bill Yuchen Lin and Bo Li and Badih Ghazi and Ravi Kumar},
+      year={2024},
+      eprint={2410.23123},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2410.23123},
 }
 ```
 
diff --git a/reasoning_gym/logic/knights_knaves.py b/reasoning_gym/logic/knights_knaves.py
index ce05f9d3..ed506829 100644
--- a/reasoning_gym/logic/knights_knaves.py
+++ b/reasoning_gym/logic/knights_knaves.py
@@ -311,19 +311,19 @@ def _format_knight_knave(self, names, knight_knave, statement, negation=False):
 class KnightsKnavesDataset(ProceduralDataset):
     """
     Generates random knights and knaves problems.
-    
+
     This implementation is adapted from the Knights and Knaves problem generator in:
     https://github.com/AlphaPav/mem-kk-logic
-    
+
     As described in the paper:
     @article{xie2024memorization,
-    title={On Memorization of Large Language Models in Logical Reasoning}, 
+    title={On Memorization of Large Language Models in Logical Reasoning},
     author={Chulin Xie and Yangsibo Huang and Chiyuan Zhang and Da Yu and Xinyun Chen and Bill Yuchen Lin and Bo Li and Badih Ghazi and Ravi Kumar},
     year={2024},
     eprint={2410.23123},
     archivePrefix={arXiv},
     primaryClass={cs.CL},
-    url={https://arxiv.org/abs/2410.23123}, 
+    url={https://arxiv.org/abs/2410.23123},
     }
     """
 

From bc270e6e6a5ed45df26135ab2c41279f80f3030e Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Tue, 25 Feb 2025 20:08:59 +0100
Subject: [PATCH 8/9] remove print statement

---
 reasoning_gym/logic/knights_knaves.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/reasoning_gym/logic/knights_knaves.py b/reasoning_gym/logic/knights_knaves.py
index ed506829..d497ab8f 100644
--- a/reasoning_gym/logic/knights_knaves.py
+++ b/reasoning_gym/logic/knights_knaves.py
@@ -455,7 +455,6 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
         try:
             oracle_assignments = self._normalize_answer(entry["answer"])
             answer_assignments = self._normalize_answer(answer)
-            print(answer_assignments)
 
             # Full credit for exact assignments regardless of order
             if oracle_assignments == answer_assignments:

From 8d5f6b1f4981c3c9d1f837950afc98c987bc7dd8 Mon Sep 17 00:00:00 2001
From: Andreas Koepf <andreas.koepf@provisio.com>
Date: Tue, 25 Feb 2025 20:15:18 +0100
Subject: [PATCH 9/9] minor formatting

---
 reasoning_gym/logic/knights_knaves.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/reasoning_gym/logic/knights_knaves.py b/reasoning_gym/logic/knights_knaves.py
index d497ab8f..fa3d4279 100644
--- a/reasoning_gym/logic/knights_knaves.py
+++ b/reasoning_gym/logic/knights_knaves.py
@@ -111,7 +111,7 @@ class KnightsKnavesConfig:
     :param n_people: Number of people in the problem
     :param depth_constraint: Maximum depth of each person's statement
     :param width_constraint: Maximum width (number of branches) of each person's statement
-    :param size: Number of items in the dataset ??????
+    :param size: Virtual size of dataset
     :param seed: Random seed
     """