diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4c8a046 --- /dev/null +++ b/.gitignore @@ -0,0 +1,23 @@ +node_modules +typings +*.pyc +.DS_Store +package-lock.json +mtp-ai-turing-tumble.iml +.idea/ +out +reinforcement_learning/wandb +reinforcement_learning/tmp +/out/ +/reinforcement_learning/tmp/ +.idea/* +reinforcement_learning/wandb/* +out/* +reinforcement_learning/tmp/* +*/META-INF/* +MANIFEST.MF +wandb_key_file +LogFile.txt +State.txt +/reinforcement_learning/dataset_generators/rl_training_set.csv +/reinforcement_learning/environments/envs/bugbit_env_backup.py diff --git a/LICENSE b/LICENSE index 7c8f7c8..b621ac1 100644 --- a/LICENSE +++ b/LICENSE @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..d45d938 --- /dev/null +++ b/README.md @@ -0,0 +1,64 @@ +# European Master Team Project - AI Turing Tumble + +> This repository holds the code that was developed during the European Master Team Project +> in the spring semester of 2022 (EMTP 22). The project was supervised by +> [Dr. Christian Bartelt](https://www.uni-mannheim.de/en/ines/about-us/researchers/dr-christian-bartelt/) and +> [Jannik Brinkmann](https://www.linkedin.com/in/brinkmann-jannik/). The project team was +> composed of students from the [Babeș-Bolyai University](https://www.ubbcluj.ro/en/) +> in Cluj-Napoca, Romania, and the [University of Mannheim](https://www.uni-mannheim.de/), Germany. + +## Introduction + +In the game Turing Tumble, players construct mechanical computers that use the flow of marbles along a board to solve +logic problems. As the board and its parts are Turing complete, which means that they can be used to express any +mathematical function, an intelligent agent taught to solve a Turing Tumble challenge essentially learns how to write +code according to a given specification. + +Following this logic, we taught an agent how to write a simple programme according to a minimal specification, using +an abstracted version of the Turing Tumble board as reinforcement learning training environment. This is related to +the emerging field of programme synthesis, as is for example applied in +[GitHub’s CoPilot](https://github.com/features/copilot). + +## Participants + +### Babeș-Bolyai University + +* [Tudor Esan](https://github.com/TudorEsan) - B.Sc. Computer Science +* [Raluca Diana Chis](https://github.com/RalucaChis) - M.Sc. Applied Computational Intelligence + +### University of Mannheim + +* [Roman Hess](https://github.com/romanhess98) - M.Sc. Data Science +* [Timur Carstensen](https://github.com/timurcarstensen) - M.Sc. Data Science +* [Julie Naegelen](https://github.com/jnaeg) - M.Sc. Data Science +* [Tobias Sesterhenn](https://github.com/Tsesterh) - M.Sc. 
Data Science + +## Contents of this repository + +The project directory is organised in the following way: + +| Path | Role | +|---------------------------|----------------------------------------------| +| `docs/` | Supporting material to document the project | +| `reinforcement_learning/` | Everything related to Reinforcement Learning | +| `src/` | Java sources | +| `ttsim/` | Source Code of the Turing Tumble Simulator | + +## Weights & Biases (wandb) +We used [Weights & Biases](https://wandb.ai/) to log the results of our training: +1. [Reinforcement Learning](https://wandb.ai/mtp-ai-board-game-engine/ray-tune-bugbit) +2. [Pretraining](https://wandb.ai/mtp-ai-board-game-engine/Pretraining) +3. [Connect Four](https://wandb.ai/mtp-ai-board-game-engine/connect-four) + +## Credits + +We used third-party software to implement the project. Namely: + +- **BugPlus** - [Dr. Christian Bartelt](https://www.uni-mannheim.de/en/ines/about-us/researchers/dr-christian-bartelt/) +- **Turing Tumble Simulator** - [Jesse Crossen](https://github.com/jessecrossen/ttsim) + +## Final Project Presentation + +Link to video: +[](https://www.youtube.com/watch?v=w501gf2MLFM) + diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..4fdf31c --- /dev/null +++ b/docs/README.md @@ -0,0 +1,111 @@ +# Setup + +> **DISCLAIMER: this project is meant to be run on Linux or macOS machines. Windows is not supported +> due to a scheduler conflict between JPype and Ray.** + +## Prerequisites + +1. An x86 machine running Linux or macOS (tested on Ubuntu 20.04 and macOS Monterey) +2. A working, clean (i.e new / separate) conda ([miniconda3](https://docs.conda.io/en/latest/miniconda.html) + or [anaconda3](https://docs.anaconda.com/anaconda/install/)) installation +3. [IntelliJ IDEA](https://www.jetbrains.com/idea/) (CE / Ultimate) + +## Installing dependencies + +1. Create and activate a new conda environment for python3.8 (`conda create -n mtp python=3.8` & `conda activate mtp`) +2. Navigate to the project root and run `pip install -r requirements.txt` +3. Run `pip install "ray[all]"` to install all dependencies for Ray + +## Python Setup + +1. In IntelliJ, open the project structure dialogue: `File -> Project Structure` +2. In Modules, select the project and click `Add` and select `Python` +3. In the `Python` tab, add a new Python interpreter by clicking on `...` +4. In the newly opened dialogue, click on `+` and click `Add Python SDK...` +5. In the dialogue, click on `Conda environment` and select the existing environment we created in the previous step +6. Select the newly registered interpreter as the project interpreter and close out of the dialogue after + clicking `apply` +7. Still in `Project Structure`, navigate to `Modules` and select the project: select the directories `src` + and `reinforcement_learning` and mark them as `Sources`. Click `Apply` and close out of the dialogue. + +## Compiling the project + +1. Make sure that the SDK and Language Level in the Project tab are set to 17 (i.e. openjdk-17) +2. Open the Project Structure Dialogue in IntelliJ `File -> Project Structure` +3. Select `Artifacts` +4. Add a JAR file with dependencies +5. Click on the folder icon and select `CF_Translated` in the next dialogue and click OK +6. Click OK again and then in the artifacts overview, in the Output Layout tab, select the Python library and remove it +7. Click on apply and OK +8. Build the artifact: `Build -> Build Artifacts` +9. 
In `reinforcement_learning/utilities/utilities.py`, make sure that the variable `artifact_directory` is set to the + folder that contains the compiled artifact. The variable `artifact_file_name` should be set to the name of the jar + file. (cf. image below: `artifact_directory = mtp_testing_jar` and `artifact_file_name = mtp-testing.jar`) + +
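For illustration, the two settings from step 9 could look roughly like this in `utilities.py` (the exact syntax in the file may differ; the values shown are the example values from above and should be adapted to your own build output):

```python
# reinforcement_learning/utilities/utilities.py (illustrative values only)
artifact_directory = "mtp_testing_jar"   # folder that contains the compiled artifact
artifact_file_name = "mtp-testing.jar"   # name of the built JAR file
```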
+
+
+
+
+
+
+An exemplary challenge +
+ +#### The BugBit Environment + +BugBit is an abstraction of the Turing Tumble board without restrictions like board size or gravity, devised +by [Dr. Christian Bartelt](https://www.uni-mannheim.de/en/ines/about-us/researchers/dr-christian-bartelt/). +Bugs represent the blue Bits from the physical game. +The orange lines indicate the control flow along the BugBit programme, which equates the flow of the marbles along the +bits on the board. +Each Bug has a control-in pin at its top where the control flow (marble) can come in and two control-out pins at the +bottom, where it can leave. +If a Bug has internal state zero (the blue Bit is flipped to the left) the control flow will leave the Bug via the right +control +out pin. +If it has state one (flipped to the right) the control flow will leave it via the left control-out pin. + +
+
+
+An Illustration of a Bug. +The data-in and -out pin may be ignored, as they are only used internally to flip a Bit’s state after the control flow +(the marble) passes through it +
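To make the pin semantics concrete, here is a minimal, illustrative Python sketch of a single Bug. It is not the project's Java implementation, only a restatement of the rules described above:

```python
class Bug:
    """A single Bug: state 0 routes control to the right out-pin, state 1 to the left; the state then flips."""

    def __init__(self, state: int = 0):
        self.state = state  # 0 = flipped to the left, 1 = flipped to the right

    def pass_control(self) -> str:
        # Choose the control-out pin based on the current state.
        out_pin = "right" if self.state == 0 else "left"
        # The marble flips the Bit after passing through it.
        self.state = 1 - self.state
        return out_pin


bug = Bug(state=0)
print(bug.pass_control())  # "right", and the Bug's state is now 1
print(bug.pass_control())  # "left",  and the Bug's state is back to 0
```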
+ +Different Bugs can now be connected with each other to define more complex programmes. +For example, the following physical Turing Tumble board can be represented by setting the connections +between Bugs as shown below. + +
+
+
+
+A Turing Tumble Board and its BugBit abstraction +
+ +#### The control flow matrix + +How the Bugs are connected with each other is stored in the control flow matrix (CF matrix). +Visualised below is a CF matrix for two Bugs. Rows indicate the control-in pins of the Bits, +and columns indicate the control-out pins of the Bits. As each Bit has one control-in pin, but two control-out pins, +we have twice as many columns as rows. If a cell contains a 1 in this matrix, it means that the corresponding in and out +pins are +connected. By setting 1s in the CF matrix we can define a programme, which we can later run. + +
+
+
+The control flow matrix +
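As a small illustration, a CF matrix for two Bugs can be written as a 2 x 4 array. The exact row and column ordering used here is an assumption made for the example, not necessarily the one used in the code base:

```python
import numpy as np

n_bugs = 2

# Rows: control-in pins of Bugs 1..n; columns: (left, right) control-out pins of Bugs 1..n.
cf_matrix = np.zeros((n_bugs, 2 * n_bugs), dtype=int)

# Connect Bug 1's right control-out pin to Bug 2's control-in pin.
cf_matrix[1, 1] = 1  # row 1 = Bug 2's in-pin, column 1 = Bug 1's right out-pin

print(cf_matrix)
# [[0 0 0 0]
#  [0 1 0 0]]
```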
+ +#### Reducing the Problem Size + +The Turing Tumble Puzzle Book contains all kinds of challenges for the player. So the first step was to define a +specific +subset of the challenges to focus on. For the purpose of this project we limited ourselves to the following problem +class: + +1. *We do not allow loops* We treat the board as if the switches at the bottom did not exist, so a challenge has to be + solved with a single marble. +2. *The waterfall principle* While in the BugBit abstraction gravity does not matter, we only allow Bits’ control flows + to go into Bits with a higher indication number, i.e. Bits placed + below them on the physical Turing Tumble board. For example, Bit 1 can only be connected to Bit 2 and 3, Bit 2 can + only be + connected to Bit 3, and from Bit 3 the control flow can not go any further. + +This limitation reduces the complexity of valid programmes, making the reinforcement learning task more feasible. + +#### The Challenge – What the agent learns + +The observation space, i.e. what the agent perceives, is comprised of two components: a CF matrix and a specification. +As we reduced the problem size to challenges that follow the waterfall principle, the **CF matrix can indeed only have +non-zero entries in its lower triangular part**. Therefore, we can represent the CF matrix as a vector of length +*n2-n*, where n is the number of Bits. + +The specification consists of a set of input-output pairs, describing the Bit positions +(flipped to the left or to the right) before and after the execution of the programme represented by the CF matrix. +The following specification now defines that when we start our programme and both Bits are flipped to the left +(state 0,0), after the programme has completed, they should both be pointing to the right (state 1, 1). +The second row says that when we start our programme with both Bits pointed to the right (state 1, 1), we want the first +Bit to be pointing to the left now (state 0) while the second Bit +stays pointed to the right (state 1) after the programme has finished. + +
+
+
+A specification: Two input output pairs describing Bit positions before and after running the programme +
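For example, the specification described above could be represented as a list of input-output pairs of Bit states. This is just an illustrative encoding, not necessarily the project's internal format:

```python
# Each pair maps the Bit states before the run to the required states after the run.
specification = [
    ((0, 0), (1, 1)),  # both Bits start left  -> both must end up pointing right
    ((1, 1), (0, 1)),  # both Bits start right -> first must flip left, second stays right
]
```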

In summary, the agent is given a **specification and a CF matrix**, which it then has to modify such that the latter
describes a programme that fulfils the former: a programme that, when run as represented in the matrix, flips the Bits
exactly as required by the specification.

The agent can solve the challenge introduced above by connecting the first Bit's right control-out pin to the second
Bit's control-in pin, which results in the following CF matrix:

+
+
+The control flow matrix solving the specified challenge +
+ +This results in the following Turing Tumble Board which, as can be seen, fulfils the specification. + +
+
+
+The solution of the challenge visualised +
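To tie the pieces together, the following self-contained sketch re-implements the rules from this section in a few lines of Python and checks that the solution above indeed fulfils the specification. This is purely illustrative; in the project itself, programmes are evaluated by the Java BugBit engine. Bug 0 and Bug 1 stand for the first and second Bit:

```python
# A minimal simulator for loop-free BugBit programmes, based on the rules described above.

def run_programme(connections, start_states):
    """connections maps (bug, 'left'/'right') -> next bug; control always enters at Bug 0."""
    states = list(start_states)
    bug = 0
    while bug is not None:
        out_pin = "right" if states[bug] == 0 else "left"
        states[bug] = 1 - states[bug]          # the marble flips the Bit it passes through
        bug = connections.get((bug, out_pin))  # an unconnected pin ends the control flow
    return tuple(states)


# Solution to the challenge: Bug 0's right control-out pin feeds Bug 1's control-in pin.
solution = {(0, "right"): 1}

specification = [((0, 0), (1, 1)), ((1, 1), (0, 1))]
assert all(run_programme(solution, before) == after for before, after in specification)
print("The programme fulfils the specification.")
```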
+ +## + +## 2. Technical Implementation + +#### Openai-Gym Environment + +The Turing Tumble Environment is implemented in two parts: The BugBit logic we use to evaluate the agent after each step +is written in Java. It is then wrapped in an openai-gym environment which exposes it to RLlib for the training process. +The resulting environment is registered as +`bugbit-v0`. The basic anatomy of an openai-gym environment is as follows: + +1. `__init__()`: Initialise the environment. +2. `reset()`: called at beginning of each episode. Reset the environment to a new state. + *In our case, sets all entries in the CF matrix, as well as input and output pairs to zero.* +3. `step()`: called at each time step. Return observation, reward, done, info. + *Lets the agent change one entry in the CF matrix and evaluates the resulting input/output pairs, gives back new CF + matrix and input/output as observed state, evaluates whether challenge is solved and rewards agent.* + +For a more detailed technical description of gym environments, see +the [openai-gym documentation](https://www.gymlibrary.ml/). + +#### Reward Function + +* -1 for every step taken. +* 10 for winning (i.e. taking an action which produces a control flow matrix that produces the same outputs as the + target matrix). +* -1 for not completing the game in the predetermined amount of steps. + +## 3. Training and Hyperparameter Optimisation + +As the problem space is quite complex and the agent cannot actually observe the BugBit mechanics behind the CF matrix, +we set up a training pipeline consisting of supervised pretraining followed by the actual reinforcement learning part. +By this we hope to give the agent a proper understanding of the environment and speed up learning. + +#### Supervised Pretraining + +Our neural network has two heads, a policy head and a value head. We pretrain the policy head with expert play in a +classical supervised setting. For this, the network is given a partially-completed CF matrix and a specification (input +and output samples). As output it then produces a vector indicating which element in the CF matrix to manipulate next. +Following the expert play concept, the network is trained to follow a fixed solution algorithm when predicting which +edge to set or delete next. We train the network minimising the KL divergence between the 'ideal' policy vector +following the expert algorithm and the one produced by the network. + +It is important to note here that the 'ideal' algorithm is not perfectly efficient. This is where the reinforcement +learning part comes in -- by penalising the agent at each time step, we aim to encourage efficient and creative problem +solving. + +#### Reinforcement Learning with Curriculum Learning (CL) + +Both the reinforcement learning part and hyperparameter optimisation are performed +with [Ray Tune](https://docs.ray.io/en/latest/tune/index.html) and implemented +in `reinforcement_learning > train.py`. + +To gradually have the agent learn how to fulfil a specification by manipulating the CF matrix, the reinforcement +learning environment implements curriculum learning (more specifically: [vanilla CL](https://arxiv.org/abs/2101.10382)). +In our setup, once the agent achieves a certain win-rate threshold on problems of the easiest difficulty (only one entry +in the CF matrix to be changed), the trainer will call `env.increment_phase()` such that all following episodes are now +of the next hardest difficulty (two steps away from a solution). 
This process repeats as the agent again increases its
win-rate on those now more difficult problems.

#### Hyperparameter Optimisation (HPO)

The [Asynchronous Hyperband Scheduler](https://arxiv.org/abs/1810.05934) is used as the trial scheduling algorithm,
and [hyperopt search](http://hyperopt.github.io/hyperopt/) is used to select suitable trials for HPO.

## 4. Data

The directory `data` contains training sets and persisted models. The structure is as follows:

| Path                                            | Role                                                              |
|-------------------------------------------------|-------------------------------------------------------------------|
| `data/agent_checkpoints/`                       | Checkpoints of the RL agents                                      |
| `data/agent_checkpoints/bugbit/`                | Checkpoints of the RL BugBit agent created during training        |
| `data/agent_checkpoints/connect_four/`          | Checkpoints of the RL Connect Four agent created during training  |
| `data/model_weights`                            | Weights of pretrained models                                      |
| `data/training_sets/pretraining_training_sets`  | Training sets for supervised pretraining                          |
| `data/training_sets/rl_training_sets`           | Training sets for RL                                              |

## 5. Training Set Generation

### 5.1. Generating Pretraining Training Sets

To generate pretraining samples, we first generate BugBit programmes for ***n*** Bits by randomly adding edges between
them, subject to the constraint that no loops are allowed. We then generate a full specification for such a programme
by running it for all possible input configurations. Of these ***2<sup>n</sup>*** input-output pairs, we keep half.
Using the expert play algorithm ('solver'), we generate CF matrices from these incomplete specifications in a stepwise
'waterfall principle' fashion, as described above. To create the final pretraining samples, we take these CF matrices
and all their intermediate versions (the 'timeline') and randomly flip up to three entries in each, thereby adding or
removing edges in the BugBit programme. We generate the target policy vector by identifying which changes need to be
reverted to come back to the original state of the control flow matrix, or to progress in the 'timeline' according to
the 'solver' algorithm.

By learning how to delete superfluous edges and how to add edges that move the state of the CF matrix from one
intermediate state to the next, the agent learns how to 'reflexively' execute the expert play algorithm.

The reason why we only use half of each specification set as inputs to our training is that for the full set there
exists only one single correct CF matrix satisfying it. In such a setting, we would not teach the agent creative
problem solving, and reinforcement learning would not make sense methodically.
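As a rough illustration of the perturbation step described above (flip a few entries of a solved CF vector and record which actions would undo the change), consider the following sketch. The helper name and the uniform distribution over the reverting actions are assumptions made for the example, not the exact scheme used by the generator scripts:

```python
import numpy as np

rng = np.random.default_rng(0)


def make_pretraining_sample(solved_cf: np.ndarray, max_flips: int = 3):
    """Perturb a solved CF vector and derive a target policy over the entries that must be reverted."""
    perturbed = solved_cf.copy()
    n_flips = rng.integers(1, max_flips + 1)
    flipped = rng.choice(len(solved_cf), size=n_flips, replace=False)
    perturbed[flipped] = 1 - perturbed[flipped]   # add/remove up to three edges

    target_policy = np.zeros(len(solved_cf))
    target_policy[flipped] = 1.0 / n_flips        # assumption: mass spread evenly over the reverting actions
    return perturbed, target_policy


solved = np.array([1, 0, 0, 1, 0, 0])             # a flattened 3-Bit CF matrix (length n**2 - n = 6)
x, y = make_pretraining_sample(solved)
print(x, y)
```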
### 5.2. Generating RL Training Sets

To create the reinforcement learning training set, we first generate random ***n***-Bit programmes and their
corresponding full specification as above. We then randomly keep half of the input-output pairs of the specification.
To get training examples that are one to ***m*** steps away from a valid solution, we iteratively add/remove edges in
the original programmes, that is, we flip entries in the CF matrix. The resulting training set can then be divided into
degrees of difficulty by the maximum 'distance', i.e. the number of steps needed to reach a known solution which still
fulfils the specification. Moving up in difficulty during the CL training process then introduces samples with a larger
'distance' into the RL training process.

## 6. Callbacks

The callbacks are implemented in `reinforcement_learning/callbacks > custom_metric_callbacks.py`. They are generally
used to log various metrics during training. In our case, we log the win-rate (i.e. how many challenges are solved
within the step limit) per epoch and then use the callback function `on_train_result()` to increment the phase of the
environment if the win-rate is above the aforementioned threshold. For a more detailed overview of the RLlib Callbacks
API, see [RLlib Callbacks](https://docs.ray.io/en/latest/_modules/ray/rllib/agents/callbacks.html).

## 7. Custom PyTorch Models

To train the reinforcement learning agent, a custom PyTorch model is used. The model is implemented
in `reinforcement_learning/custom_torch_models > rl_fully_connected_network.py`.
It is a simple fully connected neural network. The number of layers, the types of activation functions, etc. can be
defined via a config dictionary, as is done in `train.py` for the reinforcement learning and
in `reinforcement_learning/custom_torch_models > rl_network_pretraining.py` for the pretraining.
As we use a PPO trainer, the network consists of two subnetworks: the policy network and the value network.
The policy network is used to select the action to take, and the value network is used to estimate the value of the
state. The custom model implements **TorchModelV2** and **torch.nn.Module**.
For a more detailed description of custom models in RLlib, see
the [Model APIs](https://docs.ray.io/en/latest/rllib/package_ref/models.html).

## 8. Utilities

The utilities package contains two modules:

1. *utilities.utilities*: exports the paths of important directories to environment variables.
2. *utilities.registration*: registers the environments and custom models with RLlib.

## 9. Training Setup

A typical training pipeline looks as follows.
First, the pretraining and reinforcement learning training sets are generated using the scripts
`reinforcement_learning/dataset_generators > pretraining_dataset_generation.py` and
`reinforcement_learning/dataset_generators > rl_trainingset_generation.py`.

After that, supervised pretraining can be performed
with `reinforcement_learning/custom_torch_models > rl_network_pretraining.py`.
During training, the accuracy on the training and test data is computed in each epoch, showing the percentage of cases
in which the agent chooses a correct action.
Furthermore, every 10 epochs a rollout is performed, in which the agent has to solve 100 random challenges from the
test data within at most 50 actions.
The progress of these rollouts during training is also logged.

Finally, in `reinforcement_learning > train.py` the RL agent can be trained on the RL training set, using the
pretrained model as an initialisation.

## 10. 
Rollout + +After the agent has been pretrained or completely trained, rollouts can be performed to have a visualisation of the +actions the agent takes during a challenge. +Therefore, the`reinforcement_learning/custom_torch_models/rl_network_rollout.py` script can be used. The script loads +a specified number of test samples from the test data and performs a rollout with the agent on each test sample. \ No newline at end of file diff --git a/docs/translators.md b/docs/translators.md new file mode 100644 index 0000000..d4baf83 --- /dev/null +++ b/docs/translators.md @@ -0,0 +1,57 @@ +# Translator Modules + +This readme describes all 'translator scripts' used in this project. +They can be found in `reinforcement_learning/translators/` + +## Contents + +1. [Introduction and Motivation](#intro-and-mot) +2. [Constituent Components and Their Functions](#components-and-functions) +3. [Setup and Run Instructions](#setup-and-run) + + + +### 1. Introduction and Motivation + +The translator modules T1 and T2 are the bridges between Jesse Crossen's GUI of the physical Turing Tumble (TT) board +and our internal representation (BugBit) used in the reinforcement learning environment. Thereby, they do not only serve +as a visualisation device for solutions obtained by the agent, but also as a proof of concept: as both challenges and +solutions can be represented as valid physical TT boards, we have demonstrated that our RL pipeline is fundamentally +suitable to teach the agent how to play the actual game. + +As illustrated in the schematic above, T1 translates our RL environment's control flow (CF) matrix into a graphical TT +board, whereas T2 translates a board from the GUI into a CF matrix and generates the corresponding executable BugBit +code. + +### 2. Constituent Components and Their Functions + +The directory translators contains the translator modules T1 and T2, as well as auxiliary functions and TT templates for +upload to the Jesse Crossen GUI. + +| Path | Role | +|--------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `translators/translatorT1_cf_matrix_to_image.py` | Takes CF matrix and, via an intermediate representation, translates it into TT board with the help of functions in aux_partial_orderer and aux_matrix_to_image. | +| `translators/startup_translatorT2_server.py` | Starts local server for the Jesse Crossen TT GUI, which the user can then set up and download. Downloaded board is automatically translated into CF matrix and BugBit code via aux_image_to_code. | +| `translators/aux_partial_orderer.py` | Contains functions for creating the intermediate board representation. | +| `translators/aux_matrix_to_image.py` | Turns the intermediate board representation into a template suitable for upload on the TT GUI server. | +| `translators/aux_image_to_code.py` | Takes the intermediate representation downloaded from the TT GUI server and translates it into a CF matrix. With the help of the Java Bugbit environment then turns this matrix into BugBit code. | +| `translators/assets/defaultState.png` | The standard TT board. | +| `translators/assets/newDefaultState.png` | 'Reduced' TT board with only one marble colour. Used in our RL pipeline. | + +### Setup and Run Instructions [ADD TUTORIAL VIDEOS HERE??] 
+ +#### T1: From Matrix to Board + +To visualise a solution obtained by the agent, manually enter the returned CF matrix into the template at the bottom +of `translatorT1_cf_matrix_to_image.py` and run the script. + +#### T2: From Board to Matrix + +The following scripts should be run locally. +To obtain a control flow matrix from a given physical board, first run `startup_translatorT2_server.py`. This starts up +a local server for the Jesse Crossen TT and opens the GUI in a browser tab. **Upload `newDefaultState.png` via the +GUI's upload button.** Then set up a valid TT board in the GUI. When you are done, click the blank button at the +top-right. This will download the board as a `.json` file, which will then automatically be translated into a control +flow matrix, then into BugBit code, and then executed in jpype. The results of the code's execution will be returned in +the console, the specification in BugPlus code will be written to a file which can be found +under `src/de.bugplus/examples.development > Challenge.java`. \ No newline at end of file diff --git a/reinforcement_learning/__init__.py b/reinforcement_learning/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reinforcement_learning/callbacks/__init__.py b/reinforcement_learning/callbacks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reinforcement_learning/callbacks/custom_metric_callbacks.py b/reinforcement_learning/callbacks/custom_metric_callbacks.py new file mode 100644 index 0000000..83dfeff --- /dev/null +++ b/reinforcement_learning/callbacks/custom_metric_callbacks.py @@ -0,0 +1,125 @@ +""" +This file implements some of the functions that are defined in RLLib's DefaultCallbacks. +""" + +# standard library imports +from typing import Dict, TYPE_CHECKING + +# 3rd party imports +from ray.rllib.agents.callbacks import DefaultCallbacks +from ray.rllib.evaluation import RolloutWorker +from ray.rllib.env.base_env import BaseEnv +from ray.rllib.policy import Policy +from ray.rllib.evaluation.episode import Episode +from ray.rllib.utils.typing import PolicyID +# noinspection PyPackageRequirements +import torch + +if TYPE_CHECKING: + from ray.rllib.agents.trainer import Trainer + from ray.rllib.evaluation import RolloutWorker + + +class CustomMetricCallbacks(DefaultCallbacks): + + def on_episode_start( + self, + *, + worker: RolloutWorker, + base_env: BaseEnv, + policies: Dict[str, Policy], + episode: Episode, + **kwargs + ) -> None: + """ + Callback run at the start of each episode. + + :param worker: Rollout worker of ray + :param base_env: the base environment + :param policies: policies of the agent + :param episode: the episode + :param kwargs: other arguments + :return: None + """ + # Make sure this episode has just been started (only initial obs + # logged so far). + assert episode.length == 0, ( + "ERROR: `on_episode_start()` callback should be called right " + "after env reset!" + ) + episode.user_data["game_history"] = [] + + def on_trainer_init( + self, + *, + trainer: "Trainer", + **kwargs, + ) -> None: + """ + Callback run when a new trainer instance has finished setup. + This method gets called at the end of Trainer.setup() after all + the initialisation is done, and before actually training starts. 
+ + :param trainer: reference to the trainer + :param kwargs: other arguments + :return: None + """ + if trainer.config["env_config"]["pretraining"]: + trainer.get_policy().model.load_state_dict( + torch.load(trainer.config["env_config"]["pretrained_model_path"])) + + def on_episode_end( + self, + *, + worker: "RolloutWorker", + base_env: BaseEnv, + policies: Dict[PolicyID, Policy], + episode: Episode, + **kwargs, + ) -> None: + """ + Is called at the end of each episode. Appends the game history to the episode user data. + + :param worker: Rollout worker of ray + :param base_env: the base environment + :param policies: policies of the agent + :param episode: the episode + :param kwargs: other arguments + :return: None + """ + # Check if there are multiple episodes in a batch, i.e. + # "batch_mode": "truncate_episodes". + if worker.policy_config["batch_mode"] == "truncate_episodes": + # Make sure this episode is really done. + assert episode.batch_builder.policy_collectors["default_policy"].batches[ + -1 + ]["dones"][-1], ( + "ERROR: `on_episode_end()` should only be called " + "after episode is done!" + ) + + info = episode.last_info_for() + episode.user_data["game_history"].append(info["won"]) + episode.custom_metrics["game_history"] = episode.user_data["game_history"][0] + episode.hist_data["game_histories"] = episode.user_data["game_history"] + + def on_train_result( + self, + *, + trainer: "Trainer", + result: dict, + **kwargs + ) -> None: + """ + This function is called after the trainer has finished training one epoch. + + :param trainer: reference to the trainer + :param result: train result + :param kwargs: other arguments + :return: None + """ + if result["custom_metrics"]["game_history_mean"] > 0.85: + print("incrementing phase") + trainer.workers.foreach_worker( + lambda ev: ev.foreach_env( + lambda env: env.increment_phase())) diff --git a/reinforcement_learning/connect_four/__init__.py b/reinforcement_learning/connect_four/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reinforcement_learning/connect_four/connect_four_gui.py b/reinforcement_learning/connect_four/connect_four_gui.py new file mode 100644 index 0000000..1521c80 --- /dev/null +++ b/reinforcement_learning/connect_four/connect_four_gui.py @@ -0,0 +1,339 @@ +""" +GUI for Connect Four +""" + +# standard library imports +import time +from typing import Optional +import sys + +# 3rd party imports +import gym +import numpy as np +import PySimpleGUI as sg +import ray.rllib.agents.ppo as ppo +# noinspection PyPackageRequirements +import torch +import ray +from ray.rllib.utils.typing import EnvType + +# local imports (i.e. our own code) +from connect_four.helpers import determine_color, get_game_mode, invert_board, restore_agents, return_layout, \ + get_latest_agent_checkpoint +# noinspection PyUnresolvedReferences +from utilities import utilities, registration + + +class ConnectFourGUI: + """ + This class implements the GUI for the Connect Four game. + """ + + agent = None + env = None + render_mode: str = None + + def __init__( + self, + environment: EnvType, + game_mode: Optional[int] = 1, + agent1=None, + agent2=None + ): + """ + Initializes the GUI for the Connect Four game. 
+ + :param environment: the connect four environment + :param game_mode: the game mode the GUI is started with + :param agent1: the first agent + :param agent2: the second agent + """ + self.env = environment + self.agent1 = agent1 + self.agent2 = agent2 + + print(f"game_mode: {game_mode}") + if game_mode == 2 and not agent1: + sys.exit("no agent passed") + elif game_mode == 3 and not agent1 or not agent2: + sys.exit("either agent1 or agent2 missing") + if game_mode == 0: + self.greedy() + if game_mode == 1: + self.ai() + elif game_mode == 2: + self.ai_vs_ai() + elif game_mode == 3: + self.ai_vs_greedy() + elif game_mode == 4: + self.greedy_vs_greedy() + + def _update_layout(self, window): + """ + Call to update layout + + :param window: PySimpleGUI window + :return: None + """ + for i in range(len(self.env.state)): + for j in range(len(self.env.state[0])): + window[f'{i},{j}'].update(background_color=determine_color((i, j), self.env.state)) + + def greedy(self): + """ + Starts a GUI game against the greedy agent + + :return: None + """ + layout = return_layout(env_state=self.env.state) + + window = sg.Window( + 'ConnectFour', + layout=layout, + font="Helvetica", + background_color="white" + ) + while True: + event, values = window.read() + + player_action = int(event) - 1 + print(f"player_action is: {player_action}") + self.env.state, reward, self.env.done, info = self.env.interactive_step( + player_action, agent_id=1) + self._update_layout(window) + + if self.env.done: + self._update_layout(window) + break + self.env.state, reward, self.env.done, info = self.env.interactive_step( + self.env.get_greedy_action(2), agent_id=2) + self._update_layout(window) + if self.env.done: + self._update_layout(window) + break + window.read(timeout=5000) + window.close() + + window.close() + + def greedy_vs_greedy(self): + """ + Starts a game where the greedy agent plays against another greedy agent + + :return: None + """ + layout = return_layout(env_state=self.env.state) + window = sg.Window( + 'ConnectFour', + layout=layout, + font="Helvetica", + background_color="white" + ) + while True: + window.read(timeout=1000) + + self.env.state, reward, self.env.done, info = self.env.interactive_step( + self.env.get_greedy_action(1), agent_id=1) + self._update_layout(window) + + if self.env.done: + self._update_layout(window) + + self.env.state, reward, self.env.done, info = self.env.interactive_step( + self.env.get_greedy_action(2), agent_id=2) + self._update_layout(window) + if self.env.done: + self._update_layout(window) + + break + window.read(timeout=5000) + window.close() + + window.close() + + def ai(self): + """ + Starts the GUI game human vs AI + + :return: None + """ + layout = return_layout(env_state=self.env.state) + + window = sg.Window( + 'ConnectFour', + layout=layout, + font="Helvetica", + background_color="white" + ) + while True: + event, values = window.read() + player_action = int(event) - 1 + print(f"player_action is: {player_action}") + self.env.connectfour.placeToken(player_action, 2) + self.env.state = self.env.connectfour.getState() + + self._update_layout(window) + if self.env.done: + self._update_layout(window) + break + + agent_action = self.agent1.compute_action(invert_board(self.env.state)) + + self.env.state, reward, self.env.done, self.env.info = self.env.connectfour.interactive_step(agent_action) + self.env.state = np.array(self.env.state) + self.env.done = bool(self.env.done) + self.env.info = {} + + # self.env.reward += int(reward) + + self._update_layout(window) + + if 
self.env.done: + self._update_layout(window) + break + window.read(timeout=5000) + window.close() + + def ai_vs_greedy(self): + """ + Starts a gui game AI vs greedy. + + :return: None + """ + layout = return_layout(env_state=self.env.state) + + window = sg.Window( + 'ConnectFour', + layout=layout, + font="Helvetica", + background_color="white" + ) + player = True + while True: + window.read(timeout=1000) + if player: + self._update_layout(window) + agent_1_action = self.agent1.compute_action(self.env.state) + + self.env.state, reward, self.env.done, self.env.info = self.env.interactive_step(agent_1_action, + 1) + else: + window.read(timeout=2000) + self.env.state, reward, self.env.done, info = self.env.interactive_step( + self.env.get_greedy_action(2), agent_id=2) + self._update_layout(window) + player = not player + if self.env.done: + self._update_layout(window) + self.env.reset() + break + + window.read(timeout=5000) + window.close() + + window.close() + + def ai_vs_ai(self): + """ + Starts a gui game AI vs AI. + + :return: None + """ + layout = return_layout(env_state=self.env.state) + window = sg.Window( + 'ConnectFour', + layout=layout, + font="Helvetica", + background_color="white" + ) + player = True + while True: + window.read(timeout=500) + self._update_layout(window) + if player: + agent_1_action = self.agent1.compute_action(self.env.state) + self.env.state, reward, self.env.done, self.env.info = self.env.interactive_step(agent_1_action, + 1) + self.env.state = np.array(self.env.state) + self.env.done = bool(self.env.done) + self.env.info = {} + self._update_layout(window) + elif not player: + agent_2_action = self.agent2.compute_action(invert_board(self.env.state)) + self.env.state, reward, self.env.done, self.env.info = self.env.interactive_step(agent_2_action, + 2) + self.env.state = np.array(self.env.state) + self.env.done = bool(self.env.done) + self.env.info = {} + self._update_layout(window) + + if self.env.done: + self._update_layout(window) + break + player = not player + + window.read(timeout=5000) + window.close() + + window.close() + + def console_no_interaction(self): + """ + Starts the ConnectFour game without interaction in the console. + + :return: None + """ + + def _print_game(): + n, _ = np.array(self.env.state).shape + print("==============================") + print(" 1 2 3 4 5 6 7") + for i in range(n): + row = self.env.state[i].copy() + row = [" -" if x == 0 else x for x in row] + row = [" X" if x == 1 else x for x in row] + row = [" O" if x == 2 else x for x in row] + + print(*row) + print() + print("==============================") + + while not self.env.done: + action = self.agent1.compute_single_action(self.env.state) + self.env.state, reward, self.env.done, self.env.info = self.env.step(action) + self.env.reward += reward + _print_game() + time.sleep(1) + + time.sleep(3) + + +def start_game(): + """ + Starts the ConnectFour game. 
+ + :return: None + """ + ray.init(local_mode=True) + + trainer_config = ppo.DEFAULT_CONFIG.copy() + trainer_config["framework"] = "torch" + trainer_config["model"] = { + "custom_model": "custom_torch_fcnn", + "custom_model_config": { + "fcnet_hiddens": [256, 256, 256], + "fcnet_activation": torch.nn.ReLU, + "no_final_layer": False, + "vf_share_layers": False, + "free_log_std": False + }, + } + env = gym.make("connectfour-v0") + + agents = restore_agents([get_latest_agent_checkpoint(), get_latest_agent_checkpoint()], trainer_config) + + ConnectFourGUI(env, get_game_mode(), agent1=agents[0], agent2=agents[1]) + + +if __name__ == "__main__": + start_game() diff --git a/reinforcement_learning/connect_four/helpers.py b/reinforcement_learning/connect_four/helpers.py new file mode 100644 index 0000000..e5b2b48 --- /dev/null +++ b/reinforcement_learning/connect_four/helpers.py @@ -0,0 +1,198 @@ +""" +Contains helper functions for the connect four module. +""" + +# standard library imports +from typing import List +import sys +import os + +# 3rd party imports +import numpy as np +# noinspection PyPep8Naming +import PySimpleGUI as sg +from ray.rllib.agents.ppo.ppo import PPOTrainer +# noinspection PyPackageRequirements +import torch +import ray.rllib.agents.ppo as ppo + + +# functions +def get_config() -> dict: + """ + Returns the config for the PPO agent with the necessary settings for the connect four environment + + :return: dictionary containing the config + """ + config = ppo.DEFAULT_CONFIG.copy() + config["framework"] = "torch" + config["model"] = { + "custom_model": "custom_torch_fcnn", + "custom_model_config": { + "fcnet_hiddens": [256, 256, 256], + "fcnet_activation": torch.nn.ReLU, + "no_final_layer": False, + "vf_share_layers": False, + "free_log_std": False + }, + } + + return config + + +def return_layout(env_state) -> list: + """ + Returns the layout of the board. + + :param env_state: state of the connect four environment + :return: list describing the layout + """ + return [ + [ + [ + sg.Text( + size=(7, 4), justification="center", + background_color=determine_color((row, col), env_state), + border_width=1, + key=f"{row},{col}") + for col in range(7) + ] + for row in range(6) + ], [ + [ + sg.Button( + f"{col + 1}", + size=(6, 1), + button_color="#343864", + mouseover_colors="#ed5853" + ) + for col in range(7) + ] + ] + ] + + +def determine_color(coordinates: tuple, board: List[List]) -> str: + """ + Determines the colour of a piece at a given coordinate. + + :param coordinates: the coordinates of the piece + :param board: the board + :return: the colour of the piece + """ + if board[coordinates[0]][coordinates[1]] == 0: + return "white" + if board[coordinates[0]][coordinates[1]] == 1: + return "#ed5853" + else: + return "#343864" + + +def get_game_mode(): + """ + Gets the game mode from the console. 
+ + :return: game_mode + """ + game_modes = { + "--humanvsgreedy": 0, + "--humanvsai": 1, + "--aivsai": 2, + "--aivsgreedy": 3, + "--greedyvsgreedy": 4 + } + args = sys.argv[1:] + if len(args) < 1: + print("No argument found") + sys.exit(0) + if args[0] not in game_modes.keys(): + print('This game mode does not exist') + sys.exit(0) + return game_modes[args[0]] + + +def get_paths(game_mode: int): + """ + Returns the paths necessary to start the agents for Ai vs Human or AI vs AI (must be absolute paths) + + :param game_mode: int + """ + agent_path1 = None + agent_path2 = None + if game_mode >= 2: + agent_path1 = input("Agent game path: ") + if game_mode >= 3: + agent_path2 = input("Agent game path: ") + if game_mode == 0: + agent_path1 = input("Agent game path: ") + return [agent_path1, agent_path2] + + +def cast_to_np_array(conv): + """ + Converts an array to np array + + :param conv: int + :return: np_array + """ + helper_array = [[0] * 7 for i in range(6)] + for i in range(6): + for j in range(7): + helper_array[i][j] = conv[i][j] + return np.array(helper_array, dtype=np.float32) + + +def invert_board(board: np.ndarray): + """ + Player 1 gets inverted to player 2, player 2 to player 1 + + :param board: np.array + :return: np.array + """ + tmp = cast_to_np_array(board) + b1 = tmp.copy() + b1 = np.where(b1 == 1, 3, b1) + b1 = np.where(b1 == 2, 1, b1) + return np.where(b1 == 3, 2, b1) + + +def restore_agents(paths, config): + """ + Gets agents from multiple paths + + :param paths: string[] + :param config: + :return: agents[] + """ + agent1 = PPOTrainer(env="connectfour-v0", config=config) + agent2 = PPOTrainer(env="connectfour-v0", config=config) + if paths[0]: + agent1.restore(paths[0]) + if paths[1]: + agent1.restore(paths[1]) + return [agent1, agent2] + + +def preprocess_results(result: dict) -> dict: + """ + Removes the Observation space from the training result dictionary returned + by agent.train() since it could not be serialised by wandb + + :param result: dictionary of type rllib.agent.train() + :return: modified input dict + """ + result["config"]["multiagent"].pop("policies", None) + result["config"]["evaluation_config"]["multiagent"].pop("policies", None) + return result + + +def get_latest_agent_checkpoint(): + """ + Gets the latest checkpoint of the agent + + :return: latest agent checkpoint from data/agent_checkpoints/connect_four + """ + agent_checkpoints = os.listdir(f"{os.getenv('REINFORCEMENT_LEARNING_DIR')}/data/agent_checkpoints/connect_four") + agent_checkpoints.sort(key=lambda x: int(x.split("_")[-1].split(".")[0])) + return f"{os.getenv('REINFORCEMENT_LEARNING_DIR')}/data/agent_checkpoints/connect_four/{agent_checkpoints[-1]}" \ + f"/checkpoint-{agent_checkpoints[-1].split('_')[1].lstrip('0')}" diff --git a/reinforcement_learning/connect_four/train_connect_four.py b/reinforcement_learning/connect_four/train_connect_four.py new file mode 100644 index 0000000..35abcc1 --- /dev/null +++ b/reinforcement_learning/connect_four/train_connect_four.py @@ -0,0 +1,68 @@ +""" +Training file for connect four. +""" + +# standard library imports +import shutil +import os + +# 3rd party imports +import ray +import ray.rllib.agents.ppo as ppo +import wandb + +# local imports (i.e. 
our own code) +# noinspection PyUnresolvedReferences +from utilities import utilities +# noinspection PyUnresolvedReferences +from utilities import registration +import connect_four.helpers as helpers + +wandb.login() +wandb.init(project="connect-four", entity="mtp-ai-board-game-engine") + + +def main(): + """ + Main training function for connect four. + + :return: None + """ + # init directory in which to save checkpoints + chkpt_root = f"{os.getenv('REINFORCEMENT_LEARNING_DIR')}/data/agent_checkpoints/connect_four" + shutil.rmtree(chkpt_root, ignore_errors=True, onerror=None) + + # init directory in which to log results + ray_results = "{}/ray_results/".format(os.getenv("HOME")) + shutil.rmtree(ray_results, ignore_errors=True, onerror=None) + + # start Ray -- add `local_mode=True` here for debugging + ray.init() + + # configure the environment and create agent + config = helpers.get_config() + config["num_gpus"] = 0 + config["num_workers"] = 4 + + agent = ppo.PPOTrainer(env="connectfour-v0", config=config) + + # change the number of iterations to train for in range() + for n in range(100): + result = agent.train() + helpers.preprocess_results(result) + wandb.log(result) + agent.save(chkpt_root) + + print( + f"ITERATION {n + 1:2d}, " + f"min: {result['episode_reward_min']:8.2f}, " + f"mean: {result['episode_reward_mean']:8.2f}, " + f"max: {result['episode_reward_max']:8.2f}, " + f"mean length: {result['episode_len_mean']:8.2f} " + ) + + +# running this file will train a PPO agent and save the checkpoints in the +# data/agent_checkpoints/connect_four directory +if __name__ == "__main__": + main() diff --git a/reinforcement_learning/custom_torch_models/__init__.py b/reinforcement_learning/custom_torch_models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reinforcement_learning/custom_torch_models/rl_fully_connected_network.py b/reinforcement_learning/custom_torch_models/rl_fully_connected_network.py new file mode 100644 index 0000000..b128136 --- /dev/null +++ b/reinforcement_learning/custom_torch_models/rl_fully_connected_network.py @@ -0,0 +1,449 @@ +""" +The neural network we wish to use instead of the default network provided by ray's PPO trainer. +""" + +# standard library imports +import os +import sys +import random +import copy +import logging +import warnings +from typing import Optional + +# 3rd party imports +import numpy as np +import gym +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.torch.misc import SlimFC, AppendBiasLayer, normc_initializer +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import Dict, TensorType, List, ModelConfigDict +import wandb +import torch + +from torch.utils.data import DataLoader +from tqdm import tqdm + +# local imports +from dataset_generators.utils import cf_to_lower_triangular_flattened +from dataset_generators.pretraining_dataset_generation import read_samples, read_programs + +np.set_printoptions(threshold=sys.maxsize) +torch.set_printoptions(profile="full") +warnings.filterwarnings("ignore", category=UserWarning) + +torch, nn = try_import_torch() + +logger = logging.getLogger(__name__) + + +class FullyConnectedNetwork(TorchModelV2, nn.Module): + """Generic fully connected network.""" + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + ): + """ + Initialises the network. 
+ + :param obs_space: The observation space our agent receives. + :param action_space: The action space available to our agent. + :param num_outputs: The number of outputs our network should produce. + :param model_config: The model configuration. + :param name: The name of our model. + """ + + # update the config + model_config.update(model_config["custom_model_config"]) + TorchModelV2.__init__( + self, obs_space, action_space, num_outputs, model_config, name + ) + nn.Module.__init__(self) + + # define the hidden layers + hiddens = list(model_config.get("fcnet_hiddens", [])) + list( + model_config.get("post_fcnet_hiddens", []) + ) + # define the activation function + activation = model_config.get("fcnet_activation") + if not model_config.get("fcnet_hiddens", []): + activation = model_config.get("post_fcnet_activation") + + # define whether output layer should be linear or not + no_final_linear = model_config.get("no_final_linear") + + # The PPO Trainer has a value and a policy branch. Defines whether they should use the same hidden layers + # and only use unique prediction heads in the final output layer (True) or we should build two unique networks + # who only share the same input (observation space) (False). + self.vf_share_layers = model_config.get("vf_share_layers") + + # For DiagGaussian action distributions, make the second half of the model + # outputs floating bias variables instead of state-dependent. + self.free_log_std = model_config.get("free_log_std") + + # Generate free-floating bias variables for the second half of the outputs. + if self.free_log_std: + assert num_outputs % 2 == 0, ( + "num_outputs must be divisible by two", + num_outputs, + ) + num_outputs = num_outputs // 2 + + # Create the hidden layers. + layers = [] + prev_layer_size = int(np.product(obs_space.shape)) + self._logits = None + + for size in hiddens[:-1]: + layers.append( + SlimFC( + in_size=prev_layer_size, + out_size=size, + initializer=normc_initializer(1.0), + activation_fn=activation, + ) + ) + prev_layer_size = size + + # The last layer is adjusted to be of size num_outputs, but it's a layer with activation. + if no_final_linear and num_outputs: + layers.append( + SlimFC( + in_size=prev_layer_size, + out_size=num_outputs, + initializer=normc_initializer(1.0), + activation_fn=activation, + ) + ) + prev_layer_size = num_outputs + + # Finish the layers with the provided sizes (`hiddens`), plus (iff num_outputs > 0) + # a last linear layer of size num_outputs. + else: + if len(hiddens) > 0: + layers.append( + SlimFC( + in_size=prev_layer_size, + out_size=hiddens[-1], + initializer=normc_initializer(1.0), + activation_fn=activation, + ) + ) + prev_layer_size = hiddens[-1] + if num_outputs: + self._logits = SlimFC( + in_size=prev_layer_size, + out_size=num_outputs, + initializer=normc_initializer(0.01), + activation_fn=torch.nn.LogSoftmax + ) + else: + self.num_outputs = ([int(np.product(obs_space.shape))] + hiddens[-1:])[ + -1 + ] + + # Layer to add the log std vars to the state-dependent means. + if self.free_log_std and self._logits: + self._append_free_log_std = AppendBiasLayer(num_outputs) + + self._hidden_layers = nn.Sequential(*layers) + + self._value_branch_separate = None + if not self.vf_share_layers: + # Build a parallel set of hidden layers for the value net. 
+ prev_vf_layer_size = int(np.product(obs_space.shape)) + vf_layers = [] + for size in hiddens: + vf_layers.append( + SlimFC( + in_size=prev_vf_layer_size, + out_size=size, + activation_fn=activation, + initializer=normc_initializer(1.0), + ) + ) + prev_vf_layer_size = size + self._value_branch_separate = nn.Sequential(*vf_layers) + + self._value_branch = SlimFC( + in_size=prev_layer_size, + out_size=1, + initializer=normc_initializer(0.01), + activation_fn=None, + ) + # Holds the current "base" output (before logits layer). + self._features = None + # Holds the last input, in case value branch is separate. + self._last_flat_in = None + + if model_config.get("pretraining"): + self.load_state_dict(torch.load(model_config.get("pretrained_model_path"))) + + @override(TorchModelV2) + def forward( + self, + input_dict: Dict[str, TensorType], + state: List[TensorType], + seq_lens: TensorType, + ) -> (TensorType, List[TensorType]): + """ + Forward pass of the network. + + :param input_dict: The inputs for the forward pass. + :param state: list of state tensors + :param seq_lens: 1d tensor holding input sequence lengths + :return: The logits produced as a result of the forward pass as well as the state + """ + + obs = input_dict["obs_flat"].float() + self._last_flat_in = obs.reshape(obs.shape[0], -1) + self._features = self._hidden_layers(self._last_flat_in) + logits = self._logits(self._features) if self._logits else self._features + if self.free_log_std: + logits = self._append_free_log_std(logits) + + return logits, state + + def custom_forward(self, X): + """ + A custom forward function for pretraining the network. It is different to the forward function as in pretraining + there is no state and input dict. + + :param X: Input tensor with training data for the forward pass + :return: The logits produced as a result of the forward pass + """ + self._last_flat_in = X + self._features = self._hidden_layers(self._last_flat_in) + logits = self._logits(self._features) if self._logits else self._features + if self.free_log_std: + logits = self._append_free_log_std(logits) + return logits + + @override(TorchModelV2) + def value_function(self) -> TensorType: + """ + Computes the result of the value branch of the network. + + :return: Result of the value branch. + """ + + assert self._features is not None, "must call forward() first" + if self._value_branch_separate: + return self._value_branch( + self._value_branch_separate(self._last_flat_in) + ).squeeze(1) + else: + return self._value_branch(self._features).squeeze(1) + + def sample_train( + self, + x: torch.Tensor, + y: torch.Tensor, + x_test: torch.Tensor, + y_test: torch.Tensor, + t_test: torch.Tensor, + num_bugs: Optional[int] = 5, + zero_rollout: Optional[bool] = False, + num_epochs: Optional[int] = 50, + learning_rate: Optional[float] = 0.001, + batch_size: Optional[int] = 100 + ): + """ + Train the network on the given data. + + :param x: Training samples. + :param y: Training labels. + :param x_test: Test samples. + :param y_test: Test labels. + :param t_test: The template programs (the CF Matrix) of the test samples. + :param num_bugs: Number of bugs we are using for the training. + :param zero_rollout: Do the rollout with empty CF matrices. + :param num_epochs: Number of epochs to train for. + :param learning_rate: Learning rate for the optimiser. + :param batch_size: Batch size for the optimiser. 
+ :return: the trained model + """ + + # use a TensorDataset + dataset = torch.utils.data.TensorDataset(x, y) + train_loader = torch.utils.data.DataLoader( + dataset, batch_size=batch_size, shuffle=True + ) + losses = [] + test_losses = [] + + all_tries = [] + train_accuracy = [] + test_accuracy = [] + successful_solves = [] + average_steps_correct = [] + + # sample 100 test samples for the play out/rollout + play_out_indices = random.sample(range(len(x_test)), 100) + + optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate) + kl_div_loss = torch.nn.KLDivLoss(reduction='batchmean') + + for epoch in range(num_epochs): + self.train() + batch_losses = [] + train_batch_accuracies = [] + test_batch_accuracies = [] + + # train on each batch + for batch in train_loader: + input, output = batch # .view(batch_size,-1) + if torch.cuda.is_available(): + cuda0 = torch.device('cuda:0') + input = input.to(cuda0) + output = self.custom_forward(input) + y_train = (batch[1]) + loss = kl_div_loss(output, y_train) + + # get the accuracy for the current batch + for out in range(len(output)): + argmax = torch.argmax(output[out]) + if y_train[out][argmax] > 0: # some probability mass must be on this index + train_batch_accuracies.append(1) + else: + train_batch_accuracies.append(0) + batch_losses.append(loss.item()) + loss.backward() + optimizer.step() + optimizer.zero_grad() + + # start testing + self.eval() + with torch.no_grad(): + + outs = self.custom_forward(x_test) + + # get the accuracy for the test samples + for out in range(len(outs)): + argmax = torch.argmax(outs[out]) + + if y_test[out][argmax] > 0: + test_batch_accuracies.append(1) + else: + test_batch_accuracies.append(0) + + test_loss = kl_div_loss(outs, y_test) + test_losses.append(test_loss.item()) + + # Test with play outs + if epoch % 5 == 0: # each 10 epochs + tries = [] + successes = 0 + steps_correct = [] + for pi in tqdm(range(len(play_out_indices))): + + index = play_out_indices[pi] + + # get test sample + rand_sample = copy.deepcopy(x_test[index]) + + if torch.cuda.is_available(): + rand_sample = rand_sample.cpu() + + # get only the cf matrix from the test sample, not the input-output pairs + current_matrix = copy.deepcopy(rand_sample[ + :num_bugs * ( + num_bugs - 1)]) + + # rand sample is now only the input-output pairs + rand_sample = rand_sample[num_bugs * (num_bugs - 1):] + + # current matrix is empty, if we use zero rollout + if zero_rollout: + current_matrix = np.zeros(shape=current_matrix.shape) + + current_input = np.concatenate((current_matrix, rand_sample)) + + # For 50 epochs let the network take actions and break, if the network + # found the correct solution + actions = 0 + for step in range(50): + current_input = torch.from_numpy(current_input).float() + + if torch.cuda.is_available(): + out = self.custom_forward(current_input.cuda()) + # take the action which the agent is most confident about + out_choice = torch.argmax(out) + + # get the desired solution to evaluate if the current matrix is already correct + t1 = current_input[:num_bugs * (num_bugs - 1)] + + t2 = torch.from_numpy(cf_to_lower_triangular_flattened(t_test[index].cpu())).float() + else: + out = self.custom_forward(current_input) + # take the action which the agent is most confident about + out_choice = torch.argmax(out) + + # get the desired solution to evaluate if the current matrix is already correct + t1 = current_input[:num_bugs * (num_bugs - 1)] + + t2 = torch.from_numpy(cf_to_lower_triangular_flattened(t_test[index])).float() + + if 
torch.equal(t1, t2): + successes += 1 + steps_correct.append(actions) + break + + current_matrix[out_choice] = 1 if current_matrix[out_choice] == 0 else 0 + actions += 1 + current_input = np.concatenate((current_matrix, rand_sample)) + + tries.append(actions) + + all_tries.append(np.mean(tries)) + average_steps_correct.append(np.mean(steps_correct)) + print(f"\nActions needed for solved programs: {steps_correct}") + successful_solves.append(successes) + + train_accuracy.append(np.mean(train_batch_accuracies)) + test_accuracy.append(np.mean(test_batch_accuracies)) + losses.append(np.mean(batch_losses)) + print( + f"EPOCH {epoch} \t" + f"Loss: {losses[-1]} \t" + f"Train Acc: {train_accuracy[-1]} \t" + f"Test Acc: {test_accuracy[-1]} \t" + f"Test Loss: {test_losses[-1]} \t" + f"All tries: {all_tries[-1]} \t" + f"Correct Solutions: {successful_solves[-1]} \t" + f"Average Steps per Correct Solution: {average_steps_correct[-1]}" + ) + if epoch % 10 == 0: + print("Tries: ", all_tries[-1]) + wandb.log( + { + # the average loss of the training batches in each epoch + "train_loss": losses[-1], + + # the average loss on the test data + "test_loss": test_losses[-1], + + # percentage of how many actions were chosen correctly in the train data + "Train Accuracy": train_accuracy[-1], + + # percentage of how many actions were chosen correctly in the test data + "Test Accuracy": test_accuracy[-1], + + # average number of steps in the rollout + "All_tries": all_tries[-1], + + # number of solved challenges in the rollout + "Correct Solutions": successful_solves[-1], + + # average number of steps of the correctly solved challenges in the rollout + "Avg Steps/solution": average_steps_correct[-1] + + } + ) + return self diff --git a/reinforcement_learning/custom_torch_models/rl_network_pretraining.py b/reinforcement_learning/custom_torch_models/rl_network_pretraining.py new file mode 100644 index 0000000..3b5719f --- /dev/null +++ b/reinforcement_learning/custom_torch_models/rl_network_pretraining.py @@ -0,0 +1,131 @@ +""" +This file can be used to pretrain the reinforcement learning agent. +""" + +# standard library imports +import random +import os +from typing import Optional + +# 3rd party imports +import numpy as np +import gym +import torch +import wandb + +# local imports +from reinforcement_learning.custom_torch_models.rl_fully_connected_network import FullyConnectedNetwork +from dataset_generators.pretraining_dataset_generation import read_samples, read_programs +# noinspection PyUnresolvedReferences +from utilities import utilities + + +def pretrain_network( + n_bugs: int, + multiple_actions: Optional[bool], + zero_rollout: Optional[bool] = False, + disjoint_functions: Optional[bool] = False, + config: Optional[dict] = None, + num_epochs: Optional[int] = 50, + test_percentage: Optional[float] = 0.2, + lr: Optional[float] = 0.001, + batch_size: Optional[float] = 100 +): + """ + Pretrains the reinforcement learning agent. Before, the **pretraining_dataset_generator** has to be executed + to create the training set. + + :param n_bugs: number of bugs + :param multiple_actions: whether we want to use training samples, where more than one delete action is possible. + :param zero_rollout: True, if you want to start with an empty CF matrix. 
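The weights saved by `pretrain_network` can later be handed back to `FullyConnectedNetwork` through the `pretraining` and `pretrained_model_path` keys its constructor reads via `model_config.get(...)`. A sketch of such a config (the saved-model file name is a placeholder):

```python
import os
import torch

# Sketch only: the custom model config used in this repository, extended with
# the two keys checked in FullyConnectedNetwork.__init__.
config = {
    "custom_model": "custom_torch_fcnn",
    "custom_model_config": {
        "fcnet_hiddens": [256, 256, 256],
        "fcnet_activation": torch.nn.ReLU,
        "no_final_layer": False,
        "vf_share_layers": False,
        "free_log_std": False,
    },
    # if truthy, the constructor calls load_state_dict() on this path
    "pretraining": True,
    "pretrained_model_path": f"{os.getenv('REINFORCEMENT_LEARNING_DIR')}/data/model_weights/<saved model>",
}
```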
+ :param disjoint_functions: True, if input-output samples should be disjoint between all training and test samples + :param config: configuration directory to initialize the network + :param num_epochs: number of training epochs + :param test_percentage: percentage of test data in the entire data + :param lr: learning rate + :param batch_size: batch size + :return: + """ + num_outputs = (2 * n_bugs ** 2) // 2 - n_bugs + + if config is None: + config = { + 'custom_model': 'custom_torch_fcnn', + 'custom_model_config': { + 'fcnet_hiddens': [256, 256, 256], + 'fcnet_activation': torch.nn.ReLU, + 'no_final_layer': False, + 'vf_share_layers': False, + 'free_log_std': False + } + } + + # observation and action space are needed to initialise the network + observation_space = gym.spaces.Box(low=0, high=2, + shape=(1, (2 * n_bugs ** 2) // 2 - n_bugs + (2 ** n_bugs // 2) * 2 * n_bugs)) + action_space = gym.spaces.Discrete(((2 * n_bugs ** 2) // 2 - n_bugs)) + net = FullyConnectedNetwork(obs_space=observation_space, action_space=action_space, num_outputs=num_outputs, + model_config=config, name="default_model") + + # read data and create test samples + x, y = read_samples(n_bugs, multiple_actions=multiple_actions) + t = read_programs(n_bugs, multiple_actions=multiple_actions) + + if disjoint_functions: + # take last 20% of training set + test_indices = range(int(len(x) * (1 - test_percentage)), len(x)) + else: + test_indices = random.sample(range(0, len(x)), int(len(x) * test_percentage)) + x_test = np.take(x, test_indices, axis=0) + y_test = np.take(y, test_indices, axis=0) + t_test = np.take(t, test_indices, axis=0) + + x = np.delete(x, test_indices, axis=0) + y = np.delete(y, test_indices, axis=0) + t = np.delete(t, test_indices, axis=0) + x = torch.from_numpy(x).float() + y = torch.from_numpy(y).float() + x_test = torch.from_numpy(x_test).float() + y_test = torch.from_numpy(y_test).float() + t_test = torch.from_numpy(t_test).float() + + print((x_test[0])) + print(y_test[0]) + + print(f"Training Samples: {len(x)}") + print("Testing Samples: {len(X_test)}") + + # prepare the training + name = f"RL_Pretraining_Model_KL_DIV_Training_{str(n_bugs)}-Bugs--lr={str(lr)}--batch_size={str(batch_size)}--Multiple_Actions=" \ + f"{str(multiple_actions)}{str(zero_rollout)}Disjoint_Functions={str(disjoint_functions)}" + + print("New Run: ", name) + + print(os.getcwd()) + path = os.getenv('REINFORCEMENT_LEARNING_DIR') + "/data/model_weights/" + name + print(path) + + wandb.login() + wandb.init(project="Pretraining", entity="mtp-ai-board-game-engine", name=name) + + if torch.cuda.is_available(): + cuda0 = torch.device('cuda:0') + x = x.to(cuda0) + y = y.to(cuda0) + x_test = x_test.to(cuda0) + y_test = y_test.to(cuda0) + t_test = t_test.to(cuda0) + + # pretrain the network + if torch.cuda.is_available(): + net = net.cuda() + net.sample_train(x, y, x_test, y_test, t_test, zero_rollout=zero_rollout, num_bugs=n_bugs, + num_epochs=num_epochs) + + # save pretrained model + torch.save(net.state_dict(), path) + wandb.finish() + + +if __name__ == "__main__": + pretrain_network(n_bugs=3, multiple_actions=True) diff --git a/reinforcement_learning/custom_torch_models/rl_network_rollout.py b/reinforcement_learning/custom_torch_models/rl_network_rollout.py new file mode 100644 index 0000000..2809a9f --- /dev/null +++ b/reinforcement_learning/custom_torch_models/rl_network_rollout.py @@ -0,0 +1,203 @@ +""" +This class is used for performing a rollout on the fully connect reinforcement learning network that has +been pretrained. 
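For reference, the observation handled by these scripts is the flattened lower-triangular control-flow matrix concatenated with the flattened input and output samples (half of all `2 ** n_bugs` specifications). A quick sanity check of the sizes for `n_bugs = 3`:

```python
n_bugs = 3
cf_len = (2 * n_bugs ** 2) // 2 - n_bugs      # n*(n-1) = 6 editable edges
io_len = (2 ** n_bugs // 2) * 2 * n_bugs      # 4 IO pairs x 2 vectors x 3 bits = 24
assert cf_len + io_len == 30                  # matches the Box shape used for n_bugs = 3
```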
+""" + +# standard library imports +import warnings +import sys +import random +import copy +import os +from typing import Optional, List + +# 3rd party imports +import numpy as np +import gym +import torch + +# local imports +from dataset_generators.utils import cf_to_lower_triangular_flattened, flattened_repr_to_control_flow_matrix +from dataset_generators.pretraining_dataset_generation import read_samples, read_programs +from custom_torch_models.rl_fully_connected_network import FullyConnectedNetwork + +warnings.filterwarnings("ignore", category=UserWarning) +torch.set_printoptions(threshold=sys.maxsize) + + +def rollout( + model: torch.nn.Module, + x_test: torch.Tensor, + t_test: torch.Tensor, + num_bugs: int, + play_out_indices: List[int], + zero_rollout: Optional[bool] = False +): + """ + This method performs a rollout on the given model. The network is given a set of test samples for which it has 50 + epochs to take actions that reach a target CF-Matrix state. Each step is printed to the console. + + :param model: The pretrained pytorch network + :param x_test: Testing data (CF-Matrix, Input-Output samples) + :param t_test: Target cf matrices + :param num_bugs: Number of bugs + :param play_out_indices: Indices defining which test data to roll out + :param zero_rollout: True, if you want to start with an empty CF matrix. + :return: None + """ + successes = 0 + for pi in range(len(play_out_indices)): + + index = play_out_indices[pi] + rand_sample = copy.deepcopy(x_test[index]) + if torch.cuda.is_available(): + rand_sample = rand_sample.cpu() + current_matrix = copy.deepcopy(rand_sample[ + :num_bugs * ( + num_bugs - 1)]) # np.zeros(shape=(num_bugs * (num_bugs - 1))) + rand_sample = rand_sample[num_bugs * (num_bugs - 1):] + if zero_rollout: + current_matrix = np.zeros(shape=current_matrix.shape) + + current_input = np.concatenate((current_matrix, rand_sample)) + + io_size = len(rand_sample) + inputs = rand_sample[:io_size // 2].numpy() + outputs = rand_sample[-io_size // 2:].numpy() + + inputs = np.reshape(inputs, newshape=(-1, num_bugs)) + outputs = np.reshape(outputs, newshape=(-1, num_bugs)) + + actions = 0 + + print("\n") + print("New Function with following IN-Out-Samples:") + print("Inputs\t\t\t\t\tOutputs") + for io in range(len(inputs)): + print(inputs[io], "\t", outputs[io]) + + print("\nTarget Matrix:") + if torch.cuda.is_available(): + t2 = torch.from_numpy(cf_to_lower_triangular_flattened(t_test[index].cpu())).float() + print(t_test[index].cpu()) + else: + t2 = torch.from_numpy(cf_to_lower_triangular_flattened(t_test[index])).float() + print(t_test[index]) + + for step in range(50): + print("\n") + print(f"Input-Matrix at Step {step}:") + print(flattened_repr_to_control_flow_matrix(current_matrix, num_bugs)) + + current_input = torch.from_numpy(current_input).float() + + if torch.cuda.is_available(): + out = model.custom_forward(current_input.cuda()) + out_choice = torch.argmax(out) + # if epoch == 0: + + t1 = current_input[:num_bugs * (num_bugs - 1)] + + else: + out = model.custom_forward(current_input) + out_choice = torch.argmax(out) + t1 = current_input[:num_bugs * (num_bugs - 1)] + + t2 = torch.from_numpy(cf_to_lower_triangular_flattened(t_test[index])).float() + + print(f"Action chosen: {out_choice.item()}\n\n") + if torch.equal(t1, t2): + successes += 1 + # steps_correct.append(actions) + break + + current_matrix[out_choice] = 1 if current_matrix[out_choice] == 0 else 0 + actions += 1 + current_input = np.concatenate((current_matrix, rand_sample)) + + 
print("----------------------") + print("----------------------") + print(f"Solved {successes}/{len(play_out_indices)} Challenges.") + + +def start_rollout( + n_bugs: int, + multiple_actions: bool, + model_path: str, + n_rollouts: Optional[int] = 10, + zero_rollout: Optional[bool] = False, + config: Optional[dict] = None +): + """ + This method starts a rollout on the given model. + + :param n_bugs: number of bugs + :param multiple_actions: if True, the algorithm can take one of multiple delete actions in a single step. + :param model_path: Absolute path to the model that is used for the rollout + :param n_rollouts: Number of rollouts + :param zero_rollout: if true, start each rollout with an empty CF matrix + :param config: config dictionary for the custom_torch_fcnn + :return: None + """ + random.seed(10) + + num_outputs = (2 * n_bugs ** 2) // 2 - n_bugs + + if not config: + config = { + 'custom_model': 'custom_torch_fcnn', + 'custom_model_config': { + 'fcnet_hiddens': [256, 256, 256], + 'fcnet_activation': torch.nn.ReLU, + 'no_final_layer': False, + 'vf_share_layers': False, + 'free_log_std': False + } + } + + observation_space = gym.spaces.Box( + low=0, + high=2, + shape=(1, (2 * n_bugs ** 2) // 2 - n_bugs + (2 ** n_bugs // 2) * 2 * n_bugs) + ) + + action_space = gym.spaces.Discrete(((2 * n_bugs ** 2) // 2 - n_bugs)) + net = FullyConnectedNetwork(obs_space=observation_space, action_space=action_space, num_outputs=num_outputs, + model_config=config, name="default_model") + + x, y = read_samples(n_bugs, multiple_actions=multiple_actions) + t = read_programs(n_bugs, multiple_actions=multiple_actions) + + test_indices = random.sample(range(0, len(x)), int(len(x) * 0.2)) + x_test = np.take(x, test_indices, axis=0) + y_test = np.take(y, test_indices, axis=0) + t_test = np.take(t, test_indices, axis=0) + + x = np.delete(x, test_indices, axis=0) + x = torch.from_numpy(x).float() + x_test = torch.from_numpy(x_test).float() + y_test = torch.from_numpy(y_test).float() + + # load network + net.load_state_dict(torch.load(model_path)) + + play_out_indices = random.sample(range(len(x_test)), n_rollouts) + rollout(net, x_test, t_test, num_bugs=n_bugs, play_out_indices=play_out_indices, zero_rollout=zero_rollout) + + +if __name__ == "__main__": + random.seed(10) + + model_file_name: str = "RL_Pretraining_Model_KL_DIV_Training_3-Bugs--lr=0.001--batch_size=100" \ + "--Multiple_Actions=TrueFalseDisjoint_Functions=False" + + if not model_file_name: + raise ValueError("No model file name given.") + + # start a rollout with n_bugs for the given model in model_path + start_rollout( + n_bugs=3, + model_path=f"{os.getenv('REINFORCEMENT_LEARNING_DIR')}/data/model_weights/{model_file_name}", + multiple_actions=True, + zero_rollout=False, + ) diff --git a/reinforcement_learning/data/__init__.py b/reinforcement_learning/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reinforcement_learning/data/agent_checkpoints/__init__.py b/reinforcement_learning/data/agent_checkpoints/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reinforcement_learning/data/agent_checkpoints/bugbit/__init__.py b/reinforcement_learning/data/agent_checkpoints/bugbit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reinforcement_learning/data/model_weights/RL_Pretraining_Model_KL_DIV_Training_5-Bugs--lr=0.001--batchsize=100--Multiple_Actions=True b/reinforcement_learning/data/model_weights/RL_Pretraining_Model_KL_DIV_Training_5-Bugs--lr=0.001--batchsize=100--Multiple_Actions=True 
new file mode 100644 index 0000000..404f56e Binary files /dev/null and b/reinforcement_learning/data/model_weights/RL_Pretraining_Model_KL_DIV_Training_5-Bugs--lr=0.001--batchsize=100--Multiple_Actions=True differ diff --git a/reinforcement_learning/data/model_weights/__init__.py b/reinforcement_learning/data/model_weights/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reinforcement_learning/data/training_sets/__init__.py b/reinforcement_learning/data/training_sets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reinforcement_learning/data/training_sets/pretraining_training_sets/__init__.py b/reinforcement_learning/data/training_sets/pretraining_training_sets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reinforcement_learning/data/training_sets/rl_training_sets/__init__.py b/reinforcement_learning/data/training_sets/rl_training_sets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reinforcement_learning/data/training_sets/rl_training_sets/rl_training_set_50000_5_16.pkl b/reinforcement_learning/data/training_sets/rl_training_sets/rl_training_set_50000_5_16.pkl new file mode 100644 index 0000000..ae86379 Binary files /dev/null and b/reinforcement_learning/data/training_sets/rl_training_sets/rl_training_set_50000_5_16.pkl differ diff --git a/reinforcement_learning/data/training_sets/rl_training_sets/rl_training_set_50000_6_32.pkl b/reinforcement_learning/data/training_sets/rl_training_sets/rl_training_set_50000_6_32.pkl new file mode 100644 index 0000000..7412ef0 Binary files /dev/null and b/reinforcement_learning/data/training_sets/rl_training_sets/rl_training_set_50000_6_32.pkl differ diff --git a/reinforcement_learning/data/training_sets/rl_training_sets/rl_training_set_50000_7_64.pkl b/reinforcement_learning/data/training_sets/rl_training_sets/rl_training_set_50000_7_64.pkl new file mode 100644 index 0000000..f4f8663 Binary files /dev/null and b/reinforcement_learning/data/training_sets/rl_training_sets/rl_training_set_50000_7_64.pkl differ diff --git a/reinforcement_learning/data/training_sets/rl_training_sets/rl_training_set_500_5_12.pkl b/reinforcement_learning/data/training_sets/rl_training_sets/rl_training_set_500_5_12.pkl new file mode 100644 index 0000000..1be39d0 Binary files /dev/null and b/reinforcement_learning/data/training_sets/rl_training_sets/rl_training_set_500_5_12.pkl differ diff --git a/reinforcement_learning/data/training_sets/rl_training_sets/rl_training_set_500_5_16.pkl b/reinforcement_learning/data/training_sets/rl_training_sets/rl_training_set_500_5_16.pkl new file mode 100644 index 0000000..969ec96 Binary files /dev/null and b/reinforcement_learning/data/training_sets/rl_training_sets/rl_training_set_500_5_16.pkl differ diff --git a/reinforcement_learning/dataset_generators/__init__.py b/reinforcement_learning/dataset_generators/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reinforcement_learning/dataset_generators/pretraining_dataset_generation.py b/reinforcement_learning/dataset_generators/pretraining_dataset_generation.py new file mode 100644 index 0000000..a5e06cb --- /dev/null +++ b/reinforcement_learning/dataset_generators/pretraining_dataset_generation.py @@ -0,0 +1,474 @@ +""" +This file contains relevant functions to create a dataset for pretraining the Reinforcement Learning Agent. 
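Taken together with the two scripts above, the dataset generator below forms a three-stage pretraining pipeline: generate a supervised training set, pretrain the network, then roll the pretrained model out. A sketch mirroring the scripts' `__main__` blocks (assumes `REINFORCEMENT_LEARNING_DIR` is set, wandb is configured, and the Java artifact used by the generator has been built; the model path is a placeholder):

```python
from dataset_generators.pretraining_dataset_generation import create_pretraining_dataset
from custom_torch_models.rl_network_pretraining import pretrain_network
from custom_torch_models.rl_network_rollout import start_rollout

# 1. write X/Y/SampleTypes/Programs pickles to data/training_sets/pretraining_training_sets
create_pretraining_dataset(num_bugs=3, multiple_actions=True,
                           reduced_modification_percentage=0.95,
                           upper_bound_size=10000, verbose=True)

# 2. pretrain on those pickles and save the weights under data/model_weights/
pretrain_network(n_bugs=3, multiple_actions=True)

# 3. load the saved weights and print a step-by-step rollout
start_rollout(n_bugs=3, multiple_actions=True,
              model_path="<absolute path to the saved weights>")
```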
+""" + +# standard library imports +from typing import Tuple, Optional +from copy import deepcopy +import random +import pickle +from itertools import chain, combinations +import warnings +import os + +# 3rd party imports +import numpy as np +from tqdm import tqdm + +# local imports (i.e. our own code) +# noinspection PyUnresolvedReferences +from utilities import utilities +from dataset_generators.utils import generate_control_flow_matrix_and_specification, cf_to_lower_triangular_flattened + + +def solver(inputs: np.ndarray, outputs: np.ndarray = None) -> Tuple[np.ndarray, np.ndarray]: + """ + Generates a control flow matrix and the steps the solver takes for a given set of inputs and outputs + + :param inputs: list of specification inputs + :param outputs: list of specification outputs + :return: control flow matrix, solver steps + """ + + bits = len(inputs[0]) + + cf_matrix = np.zeros(shape=(bits, 2 * bits), dtype=np.int64) + + algorithm_steps = [] + + # the algorithm: + for i, input_spec in enumerate(inputs): + for j, input_elem in enumerate(input_spec[:-1]): + if input_elem - outputs[i][j] != 0: + for k, spec_elem in enumerate(input_spec): + if spec_elem - outputs[i][k] != 0 and k > j: + cf_matrix[k, 2 * j + 1 - input_spec[j]] = 1 + if [k, 2 * j + 1 - input_spec[j]] not in algorithm_steps: + algorithm_steps.append([k, 2 * j + 1 - input_spec[j]]) + break + + return cf_matrix, np.array(algorithm_steps) + + +def generate_sample( + n_bugs: Optional[int] = 5, + multiple_actions=False, + reduced_modification_percentage: float = 0 +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, list]: + """ + Generates training samples for a given number of bugs. All training samples have the same input-output + pairs but different control flow matrix modifications. It then returns the generated program (target) + by the solver, the order of steps to create this program by the algorithm, the input output pairs + and all the control flow matrix modifications. + + :param n_bugs: number of bugs + :param multiple_actions: if True, the algorithm can take one of multiple delete actions in a single step. + :param reduced_modification_percentage: percentage of unused control flow matrix modifications. + If 0, all possible modifications are returned. 
+ + :return: target program, algorithm steps, input output pairs, control flow matrix modifications + """ + ins: np.ndarray + outs: np.ndarray + prog: np.ndarray + ins, outs, prog = generate_control_flow_matrix_and_specification(n_bugs=n_bugs) + + # choose subset (half) of all input-output pairs + choice = random.sample(range(ins.shape[0]), len(ins) // 2) + ins = ins[choice, :] + outs = outs[choice, :] + + algorithm_steps: np.ndarray + prog, algorithm_steps = solver(inputs=ins, outputs=outs) + + # detect all partial solutions where at least one 1 has to be added + s = list(np.argwhere(prog)) + powerset = chain.from_iterable(combinations(s, r) for r in range(len(s) + 1)) + + res = [] + + for _, elem in enumerate(powerset): + a = np.zeros(shape=np.shape(prog), dtype=np.int64) + for _, index in enumerate(elem): + a[index[0], index[1]] = 1 + res.append(a) + + # remove as many modifications as specified by the percentage + if reduced_modification_percentage > 0: + idx_list = random.sample(range(0, len(res)), + int(len(res) * reduced_modification_percentage) + ) # remove 95% of samples for each function + res_b = [elem for index, elem in enumerate(res) if index not in idx_list] + res = res_b + + # create modifications where at least one 1 is added randomly to the respective CF matrix + modifications = additive_modifications(np.array(res), prog=prog, multiple_actions=multiple_actions) + return prog, algorithm_steps, ins, outs, modifications + + +def create_training_samples(n_bugs: int, multiple_actions: Optional[bool] = False, + reduced_modification_percentage: Optional[float] = 0.0): + """ + Generates training samples for a given number of bugs. All training samples have the same input-output + pairs but different control flow matrices. For each training sample, it creates a target probability + vector, depending on which action to take next when following the expert algorithm. The probability + vector is different, depending on whether an edge has to be added or deleted next. + For each sample, it also returns the corresponding target matrix by the solver. + + :param n_bugs: number of bugs + :param multiple_actions: if True, the algorithm can take one of multiple delete actions in a single step. + :param reduced_modification_percentage: percentage of unused control flow matrix modifications. 
+ :return: x_samples, y_samples, sample_types, target program + """ + + x_samples = [] + y_samples = [] + sample_types = [] + ts = [] + + # generate training samples for one input-output pair + t, algorithm_steps, ins, outs, modifications = generate_sample( + n_bugs=n_bugs, + multiple_actions=multiple_actions, + reduced_modification_percentage=reduced_modification_percentage + ) + + # create label vectors for each training sample + for pre in modifications: + label = np.zeros(((n_bugs * 2 * n_bugs) // 2 - n_bugs,)) + + diff: np.array = t - pre + + # Check + if -1 in diff: + indices = np.argwhere(diff == -1) + # how many Ones could we remove + ahead = len(indices) + for index in indices: + # equally distribute the probability mass over all possible deletions + label[get_new_index(n_bugs, index)] = 1 / ahead + sample_type = -1 + elif 1 in diff: + indices = np.argwhere(diff == 1) + sample_type = 1 + set_one = False + for coord in algorithm_steps: + for ind in indices: + if np.array_equal(coord, ind): + # put all weight on next add action of solving algorithm + label[get_new_index(n_bugs, coord)] = 1 + set_one = True + break + if set_one: + break + else: + raise AssertionError("Pre Matrix equals target Matrix") + + pre1 = cf_to_lower_triangular_flattened(cf_matrix=pre) + + ins1 = np.array(ins).flatten() + outs1 = np.array(outs).flatten() + x1 = np.concatenate((pre1, ins1, outs1)) + label = label.flatten() + x_samples.append(x1) + y_samples.append(label) + sample_types.append(sample_type) + ts.append(t) + return x_samples, y_samples, sample_types, ts + + +def additive_modifications( + history: np.ndarray, + n_modifications: Optional[int] = 1, + multiple_actions: Optional[bool] = False, + prog=None +) -> list: + """ + Returns additive modifications of the history created by the solving algorithm + + :param prog: The target CF matrix found by the algorithm + :param history: in-template history + :param n_modifications: how many modifications to produce for each history element + :param multiple_actions: whether we want to produce training samples, where more than one action is needed to + get back on template or not + :return: array of arrays of original history elements and their additive modifications. The originals + are the 0th elements in each array + """ + + warnings.warn( + message="additive_modifications() (so far) does not check whether " + "a generated additive modification leads to a on sample matrix", + category=FutureWarning, + stacklevel=1) + + res: list = [] + + possible_coordinate_mappings = get_valid_coordinates(n_bugs=len(history[0])) + + for _, hist_elem in enumerate(history): + if not np.array_equal(prog, hist_elem): + res.append(hist_elem) + for n in range(n_modifications): + tmp = deepcopy(hist_elem) + z = 1 + if multiple_actions: + # Flip up to 3 bits in the CF matrix depending on the given probabilities + z = np.random.choice([1, 2, 3], 1, p=[0.5, 0.3, 0.2])[0] + for _ in range(z): + idx = np.random.choice(len(possible_coordinate_mappings)) + tmp[possible_coordinate_mappings[idx]] = 1 + if not np.array_equal(prog, tmp): + res.append(tmp) + + return res + + +def get_new_index(n_bugs: int, coordinate: Tuple[int, int]) -> int: + """ + Gets a coordinate of a CF matrix entry and return its corresponding index in the target label, if + the control flow matrix is a lower triangle matrix (waterfall principle). 
+ + :param n_bugs: Number of Bugs + :param coordinate: A coordinate (x,y) of a matrix + :return: int: Element index if relevant CF matrix entries are sorted line by line + """ + + c_normal = coordinate[0] * 2 * n_bugs + coordinate[1] + new_index = c_normal + for i in range(coordinate[0]): + new_index = new_index - (2 * n_bugs - 2 * i) + return new_index + + +def get_valid_coordinates(n_bugs: int) -> dict: + """ + Returns a dict that maps the new index of a coordinate in the lower triangle matrix + + to the original coordinate + :param n_bugs: Number of Bugs + :return: dict: Maps the new index of a coordinate in the lower triangle matrix to a valid x,y coordinate pair + """ + + # a dict containing sorted relevant CF coordinates + transformations = dict() + for i in range(n_bugs): + for j in range(2 * n_bugs): + if j >= 2 * i: + continue + transformations[get_new_index(n_bugs, (i, j))] = (i, j) + return transformations + + +def write_samples_to_file( + num_bugs: int, + x_samples: np.ndarray, + y_samples: np.ndarray, + sample_types: np.ndarray, + programs: np.ndarray, + multiple_actions: Optional[bool] = False, + verbose: Optional[bool] = False +): + """ + Writes samples to file. The samples are written to a file in the following format: + + :param num_bugs: Number of Bugs + :param x_samples: Training Samples (CF Matrix, Inputs, Outputs respectively) + :param y_samples: Target Labels + :param sample_types: Sample Types + :param programs: Programs + :param multiple_actions: Whether we want to produce training samples, where more than one delete action is possible + :param verbose: Whether we want to print the progress of the writing process + :return: None + """ + file_path = f"{os.getenv('REINFORCEMENT_LEARNING_DIR')}/data/training_sets/pretraining_training_sets" + x_file_name = f"{file_path}/X_TrainingSet{num_bugs}{('multiple_actions' if multiple_actions else '')}.pkl" + y_file_name = f"{file_path}/Y_TrainingSet{num_bugs}{('multiple_actions' if multiple_actions else '')}.pkl" + type_file_name = f"{file_path}/SampleTypes{num_bugs}{('multiple_actions' if multiple_actions else '')}.pkl" + program_file_name = f"{file_path}/Programs{num_bugs}{('multiple_actions' if multiple_actions else '')}.pkl" + + for file, samples in [ + (x_file_name, x_samples), + (y_file_name, y_samples), + (type_file_name, sample_types), + (program_file_name, programs) + ]: + with open(file, 'wb') as f: + pickle.dump(samples, f, protocol=pickle.HIGHEST_PROTOCOL) + + if verbose: + print("\nWrote ", len(x_samples), "training samples to Files.") + + values, counts = np.unique(sample_types, return_counts=True) + type0 = "REMOVE_EDGES" if values[0] == -1 else "ADD_EDGES" + type1 = "ADD_EDGES" if type0 == "REMOVE_EDGES" else "REMOVE_EDGES" + print(f"{counts[0]} ({round((counts[0] / len(x_samples)) * 100, 2)}%) samples are of type '{type0}'.") + print(f"{counts[1]} ({round((counts[1] / len(x_samples)) * 100, 2)}%) samples are of type '{type1}'.") + + +def read_samples( + num_bugs: int, + multiple_actions: Optional[bool] = False, + verbose: Optional[bool] = False +): + """ + Reads x,y samples from file. 
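To make the index mapping implemented by `get_new_index` and `get_valid_coordinates` concrete: for `n_bugs = 3` the six editable (lower-triangular) entries of the 3 x 6 control-flow matrix map onto flat label indices 0 to 5 as follows (importing the module again assumes the usual project setup):

```python
from dataset_generators.pretraining_dataset_generation import (
    get_new_index,
    get_valid_coordinates,
)

assert get_new_index(3, (1, 0)) == 0
assert get_new_index(3, (2, 3)) == 5
print(get_valid_coordinates(3))
# {0: (1, 0), 1: (1, 1), 2: (2, 0), 3: (2, 1), 4: (2, 2), 5: (2, 3)}
```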
The samples are read from a file in the following format: + + :param num_bugs: Number of Bugs + :param multiple_actions: Whether we want to use training samples, where more than one delete action is possible + :param verbose: Whether we want to print the progress of the reading process + :return: None + """ + + file_path = f"{os.getenv('REINFORCEMENT_LEARNING_DIR')}/data/training_sets/pretraining_training_sets" + x_file_name = f"{file_path}/X_TrainingSet{num_bugs}{('multiple_actions' if multiple_actions else '')}.pkl" + y_file_name = f"{file_path}/Y_TrainingSet{num_bugs}{('multiple_actions' if multiple_actions else '')}.pkl" + + with open(x_file_name, 'rb') as f: + x_samples = pickle.load(f) + + with open(y_file_name, 'rb') as f: + y_samples = pickle.load(f) + + if verbose: + print("\nRead ", len(x_samples), "training samples from Files.") + return x_samples, y_samples + + +def read_sample_types( + num_bugs: int, + multiple_actions: Optional[bool] = False, + verbose: Optional[bool] = False +): + """ + Reads the sample types from the given file. A sample type can be either -1 or 1. -1 means that this sample is + a sample, where the corresponding PRE matrix has at least one 1 too much. 1 means that the PRE matrix does + not have unnecessary/wrong ones, but at least one 1 has to be added. + + :param num_bugs: Number of Bugs + :param multiple_actions: Whether we want to use training samples, where more than one delete action is possible. + :param verbose: Whether we want to print the progress of the reading process + :return: Sample Types + """ + + file_path = f"{os.getenv('REINFORCEMENT_LEARNING_DIR')}/data/training_sets/pretraining_training_sets" + type_file_name = f"{file_path}/SampleTypes{num_bugs}{('multiple_actions' if multiple_actions else '')}.pkl" + + with open(type_file_name, "rb") as f: + sample_types = pickle.load(f) + + if verbose: + print("\nRead Sample Types.") + values, counts = np.unique(sample_types, return_counts=True) + type0 = "REMOVE_EDGES" if values[0] == -1 else "ADD_EDGES" + type1 = "ADD_EDGES" if type0 == "REMOVE_EDGES" else "REMOVE_EDGES" + print(f"{counts[0]} ({round((counts[0] / len(sample_types)) * 100, 2)}%) samples are of type '{type0}'.") + print(f"{counts[1]} ({round((counts[1] / len(sample_types)) * 100, 2)}%) samples are of type '{type1}'.") + + return sample_types + + +def read_programs( + num_bugs: int, + multiple_actions: Optional[bool] = False, + verbose: Optional[bool] = False +): + """ + Reads the programs from the given file. A Program is the target matrix for the corresponding input output pairs + found by the algorithm. + + :param num_bugs: Number of Bugs + :param multiple_actions: Whether we want to use training samples, where more than one delete action is possible. 
+ :param verbose: Whether we want to print the progress of the reading process + :return: Programs + """ + file_path = f"{os.getenv('REINFORCEMENT_LEARNING_DIR')}/data/training_sets/pretraining_training_sets" + type_file_name = f"{file_path}/Programs{num_bugs}{('multiple_actions' if multiple_actions else '')}.pkl" + + with open(type_file_name, "rb") as f: + programs = pickle.load(f) + + if verbose: + print(f"\nRead {len(programs)} Programs.") + + return programs + + +def create_pretraining_dataset( + num_bugs: int, + multiple_actions: Optional[bool] = False, + reduced_modification_percentage: Optional[float] = 0.0, + upper_bound_size: Optional[int] = 500000, + verbose: Optional[bool] = False +): + """ + Creates a full pretraining dataset for pretraining the Reinforcement Learning Agent. + The dataset consists of training samples, of which each has a control flow matrix and input-output pairs. + Pretraining is done in a supervised way with label vectors determining the desired action the agent should take. + + :param num_bugs: Number of Bugs + :param multiple_actions: Whether we want to use training samples, where more than one delete action is possible. + :param reduced_modification_percentage: Percentage of samples, which are randomly removed from the dataset. If the + value is high, this increases the number of different input-output pairs in the dataset. + :param upper_bound_size: Maximum size of the dataset. + :param verbose: Whether we want to print the progress of the creation process. + :return: None + """ + + x_samples = [] + y_samples = [] + sample_types = [] + programs = [] + for i in tqdm(range(200000)): + x_sample, y_sample, sample_type, t = create_training_samples( + n_bugs=num_bugs, + multiple_actions=multiple_actions, + reduced_modification_percentage=reduced_modification_percentage + ) + + x_samples += x_sample + y_samples += y_sample + sample_types += sample_type + programs += t + + if len(x_samples) > upper_bound_size: + print("Specifications tried: ", i) + print("Samples: ", len(x_samples)) + break + + x_samples = np.array(x_samples, dtype=float) + y_samples = np.array(y_samples, dtype=float) + + print("Complete Size in Bytes: ", x_samples.size * x_samples.itemsize) + + # we don't want duplicates in our training set + x_samples, lbl_indices = np.unique(x_samples, return_index=True, axis=0) + y_samples = np.take(y_samples, lbl_indices, axis=0) + sample_types = np.take(sample_types, lbl_indices, axis=0) + all_programs = np.take(programs, lbl_indices, axis=0) + print("Size after removing Duplicates: ", len(x_samples)) + + # how many functions do we have? 
+ number_of_functions = np.unique(x_samples[:, num_bugs * (num_bugs - 1):], return_index=False, axis=0) + print("Number of Functions: ", len(number_of_functions)) + + # write Training set into files + write_samples_to_file( + num_bugs=num_bugs, + x_samples=x_samples, + y_samples=y_samples, + sample_types=sample_types, + programs=all_programs, + multiple_actions=multiple_actions, + verbose=verbose + ) + + +if __name__ == "__main__": + random.seed(10) + create_pretraining_dataset( + num_bugs=3, + multiple_actions=True, + reduced_modification_percentage=0.95, + upper_bound_size=10000, + verbose=True + ) diff --git a/reinforcement_learning/dataset_generators/rl_trainingset_generation.py b/reinforcement_learning/dataset_generators/rl_trainingset_generation.py new file mode 100644 index 0000000..69fbb07 --- /dev/null +++ b/reinforcement_learning/dataset_generators/rl_trainingset_generation.py @@ -0,0 +1,193 @@ +""" +This file is used to generate the training set / training samples for RL +""" + +import os +from typing import Optional +import random +import time +from copy import deepcopy + +# 3rd party imports +import pandas as pd +import numpy as np + +# local imports (i.e. our own code) +# noinspection PyUnresolvedReferences +from utilities import utilities +from dataset_generators.utils import generate_control_flow_matrix_and_specification, get_outputs, \ + cf_to_lower_triangular_flattened, flattened_repr_to_control_flow_matrix + + +def generate_rl_training_set( + size: Optional[int] = 100, + n_bugs: Optional[int] = 5, + sample_size: Optional[int] = 0, + pickle: Optional[bool] = True +) -> pd.DataFrame: + """ + Generates a training set for the RL algorithm + + :param pickle: whether to persist the training set as a pickle file + :param size: of the training set + :param n_bugs: number of bugs to be used + :param sample_size: if 0: half of the specification size, else: sample_size number of specification pairs + :return: pd.DataFrame(columns=["distance","input_samples", "output_samples", "modified_control_flow_matrix"]) + """ + training_set = pd.DataFrame( + columns=[ + "distance", + "input_samples", + "output_samples", + "modified_control_flow_matrix" + ] + ) + + # time the training set generation and print the runtime + start = time.time() + + # while the training set is not full + while len(training_set) < size: + # generate a random program and the full specification + ins, outs, prog = generate_control_flow_matrix_and_specification(n_bugs=n_bugs) + + # if sample size not specified (i.e. 
0), take exactly half of the full + sample_size = int(((2 ** n_bugs) / 2) if sample_size == 0 else sample_size) + + # take a random subsample of the full specification + sample_choice = random.sample(range(ins.shape[0]), sample_size) + + ins = ins[sample_choice, :] + outs = outs[sample_choice, :] + + # convert the matrix to the vector representation + flat_matrix = deepcopy(cf_to_lower_triangular_flattened(prog)) + subtractive_modifications = [] + queue = [(0, flat_matrix)] + + # generate subtractive modifications + while queue: + distance, current_element = queue.pop() + edge_indices = np.argwhere(current_element == 1) + local_results = [] + + for index in edge_indices: + tmp = deepcopy(current_element) + tmp[index] = 0 + local_results.append(tmp) + + for modification in local_results: + if subtractive_modifications: + appears = False + # check if the modification is already in the list + for _, elem in subtractive_modifications: + if np.array_equal(modification, elem): + appears = True + # if it is not in the list, add it to the queue and the list + if not appears: + queue.append((distance + 1, modification)) + subtractive_modifications.append((distance + 1, modification)) + else: + queue.append((distance + 1, modification)) + subtractive_modifications.append((distance + 1, modification)) + + # generate additive modifications based on subtractive_modifications + additive_modifications = [] + + for distance, matrix in subtractive_modifications: + # take random zero index and add one to it + zero_indices = np.argwhere(matrix == 0) + + local_results = [] + + for index in zero_indices: + tmp = deepcopy(matrix) + tmp[index] = 1 + local_results.append(tmp) + + # check that the additive modifications are not in additive_modifications or subtractive_modifications + for modification in local_results: + appears = False + for _, elem in additive_modifications: + if np.array_equal(modification, elem): + appears = True + if not appears: + for _, elem in subtractive_modifications: + if np.array_equal(modification, elem): + appears = True + if not appears and not np.array_equal(modification, flat_matrix): + additive_modifications.append((distance + 1, modification)) + + # generate additive modifications based on the original program + flat_matrix = deepcopy(cf_to_lower_triangular_flattened(prog)) + original_additive_modifications = [] + queue = [(0, flat_matrix)] + + while queue and len(original_additive_modifications) < 30: + distance, current_element = queue.pop() + no_edge_indices = np.argwhere(current_element == 0) + local_results = [] + + for index in no_edge_indices: + tmp = deepcopy(current_element) + tmp[index] = 1 + local_results.append(tmp) + + for modification in local_results: + + appears = False + # check if the modification is already in the list + for _, elem in original_additive_modifications: + if np.array_equal(modification, elem): + appears = True + # check if it is already in the additive_modifications list + if not appears: + for _, elem in additive_modifications: + if np.array_equal(modification, elem): + appears = True + + # if it is not in the list, add it to the queue and the list + if not appears: + queue.append((distance + 1, modification)) + original_additive_modifications.append((distance + 1, modification)) + + # merge subtractive_modifications, additive_modifications, original_additive_modifications into one list + modifications = subtractive_modifications + additive_modifications + original_additive_modifications + results = [] + + # check if the modifications generate 
the same outputs as the original + # program for the sample of the specification + for distance, elem in modifications: + mod_outs = get_outputs( + n_bugs=n_bugs, + ins=ins, + prog=flattened_repr_to_control_flow_matrix( + flat_cf_repr=elem, + n_bugs=n_bugs) + ) + # check if mod_outs and outs are the same + if not np.array_equal(a1=outs, a2=mod_outs): + results.append((distance, elem)) + + # add the results that do not generate the same outputs as the original program to the training set + for distance, elem in results: + training_set = training_set.append(pd.DataFrame({ + "distance": distance, + "input_samples": [ins], + "output_samples": [outs], + "modified_control_flow_matrix": [elem] + }), ignore_index=True) + + end = time.time() + print("RL training set generation took {} seconds".format(end - start)) + if pickle: + training_set.to_pickle( + f"{os.getenv('REINFORCEMENT_LEARNING_DIR')}/data/training_sets/rl_training_sets/rl_training_set_" + f"{size}_{n_bugs}_{sample_size}.pkl" + ) + return training_set + + +if __name__ == "__main__": + random.seed(10) + generate_rl_training_set(size=10000, n_bugs=3) diff --git a/reinforcement_learning/dataset_generators/utils.py b/reinforcement_learning/dataset_generators/utils.py new file mode 100644 index 0000000..61884f0 --- /dev/null +++ b/reinforcement_learning/dataset_generators/utils.py @@ -0,0 +1,146 @@ +""" +This file contains useful functions to create the datasets for pretraining as well as for the reinforcement learning. +""" +# standard library imports +from typing import List, Tuple, Optional + +from numpy import binary_repr +import numpy as np +# noinspection PyPackageRequirements +import jpype +# noinspection PyUnresolvedReferences +from utilities import utilities + +CF_Translated = jpype.JClass( + "de.bugplus.examples.development.CF_Translated" +) + + +def generate_control_flow_matrix_and_specification(n_bugs: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Generates a control flow matrix and the corresponding specification + + :param n_bugs: number of bugs + :return: + """ + + prog = generate_control_flow_sequentially(n_bugs=n_bugs) + + ins: np.ndarray = generate_ins(n_bugs=n_bugs) + outs = [] + + for _, elem in enumerate(ins): + outs.append(np.array(CF_Translated.execute(n_bugs, prog, elem), dtype=np.int64)) + return ins, np.array(outs), prog + + +def get_outputs(n_bugs: int, ins: np.ndarray, prog: np.ndarray) -> List[np.ndarray]: + """ + For a given input and program, returns the outputs of the program + + :param n_bugs: number of bugs of the program + :param ins: program inputs + :param prog: control flow matrix of the program + :return: program outputs of len(ins) + """ + outs = [] + for _, elem in enumerate(ins): + outs.append(np.array(CF_Translated.execute(n_bugs, prog, elem), dtype=np.int64)) + return outs + + +def generate_ins(n_bugs: int) -> np.ndarray: + """ + Generates all possible inputs for the generator for a given number of bugs + + :param n_bugs: number of bugs the program will have + :return: ndarray of all possible inputs + """ + return np.array( + [ + np.array([int(bit) for bit in binary_repr(num=n, width=n_bugs)], dtype=np.int64) + for n in range(2 ** n_bugs) + ] + ) + + +def generate_control_flow_sequentially(n_bugs: int, num_edges: Optional[int] = None) -> np.ndarray: + """ + returns a (sequentially) generated control flow matrix for turing tumble / BugBit + + :param n_bugs: number of bugs used + :param num_edges: number of edges to be used (OPTIONAL) + :return: generated control flow matrix (NDArray) + 
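The helpers defined further down in this utils module convert between the full `n x 2n` control-flow matrix and the flattened lower-triangular vector the agent works on. A round-trip sanity check (assumes the project setup, since this module starts the JPype bridge on import):

```python
import numpy as np
from dataset_generators.utils import (
    generate_control_flow_sequentially,
    cf_to_lower_triangular_flattened,
    flattened_repr_to_control_flow_matrix,
)

n_bugs = 3
prog = generate_control_flow_sequentially(n_bugs=n_bugs)       # shape (3, 6)
flat = cf_to_lower_triangular_flattened(cf_matrix=prog)        # shape (6,)
restored = flattened_repr_to_control_flow_matrix(flat, n_bugs=n_bugs)
assert np.array_equal(prog, restored)
```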
""" + connected_bugs = [0] + + cf = np.zeros(shape=(2 * n_bugs, n_bugs), dtype=np.int64) + + sink = -3 + + num_edges = 2 * (n_bugs - 1) if not num_edges else num_edges + + while connected_bugs and num_edges != 0: + current_bug = connected_bugs[0] + + options = list(range(current_bug + 1, n_bugs)) + [sink] + + choice_a, choice_b = np.random.choice(a=options, size=2) + + if choice_a != sink: + cf[current_bug * 2, choice_a] = 1 + num_edges -= 1 + if choice_a not in connected_bugs: + connected_bugs.append(choice_a) + + if choice_b != sink and num_edges != 0: + cf[current_bug * 2 + 1, choice_b] = 1 + num_edges -= 1 + if choice_b not in connected_bugs: + connected_bugs.append(choice_b) + + connected_bugs.remove(current_bug) + + return cf.T + + +def flattened_repr_to_control_flow_matrix(flat_cf_repr: np.ndarray, n_bugs: int) -> np.ndarray: + """ + Converts a flattened representation of the control flow matrix to a control flow matrix + + :param flat_cf_repr: + :param n_bugs: + :return: + """ + cf_new = np.zeros(shape=(n_bugs, 2 * n_bugs), dtype=np.int64) + + i = 0 + for rowindex in range(n_bugs): + # remove the irrelevant CF Matrix elements (waterflow principle, upper right corner) + row_add_indices = list(range(2 * rowindex)) + + for rai in row_add_indices: + # print(rowindex, rai, i) + cf_new[rowindex, rai] = flat_cf_repr[i] + i += 1 + return cf_new + + +def cf_to_lower_triangular_flattened(cf_matrix: np.ndarray) -> np.ndarray: + """ + Converts a control flow matrix to a lower triangular matrix represented as a vector + + :param cf_matrix: + :return: + """ + n_bugs = len(cf_matrix) + + remove_indices = [] + + for rowindex in range(len(cf_matrix)): + row_remove_indices = list(range(2 * rowindex + rowindex * 2 * n_bugs, 2 * n_bugs * (rowindex + 1))) + remove_indices += row_remove_indices + + cf_matrix = np.array(cf_matrix).flatten() + + return np.delete(cf_matrix, remove_indices) diff --git a/reinforcement_learning/environments/__init__.py b/reinforcement_learning/environments/__init__.py new file mode 100644 index 0000000..6dc1067 --- /dev/null +++ b/reinforcement_learning/environments/__init__.py @@ -0,0 +1,11 @@ +from gym.envs.registration import register + +register( + id="bugbit-v0", + entry_point="environments.envs:BugBit_v0", +) + +register( + id="connectfour-v0", + entry_point="environments.envs:ConnectFourMVC", +) \ No newline at end of file diff --git a/reinforcement_learning/environments/envs/__init__.py b/reinforcement_learning/environments/envs/__init__.py new file mode 100644 index 0000000..ddbcebb --- /dev/null +++ b/reinforcement_learning/environments/envs/__init__.py @@ -0,0 +1,2 @@ +from environments.envs.bugbit_env import BugBit +from environments.envs.connectfourmvc_env import ConnectFourMVC diff --git a/reinforcement_learning/environments/envs/bugbit_env.py b/reinforcement_learning/environments/envs/bugbit_env.py new file mode 100644 index 0000000..a75f980 --- /dev/null +++ b/reinforcement_learning/environments/envs/bugbit_env.py @@ -0,0 +1,208 @@ +""" +BugBit environment for RL training. Abstraction of the Turing Tumble game. +""" + +# standard library imports +from copy import deepcopy +from typing import Dict, Any + +# 3rd party imports +import pandas as pd +import gym +from gym.utils import seeding + +import numpy as np + +# local imports (i.e. 
our own code) +# noinspection PyUnresolvedReferences +from utilities import utilities +from dataset_generators.utils import flattened_repr_to_control_flow_matrix, get_outputs + + +# noinspection PyMethodMayBeStatic +class BugBit(gym.Env): + + def __init__(self, config: dict): + """ + Initialises the environment. + + :param config: dictionary containing the config for the environment + :return: None + """ + super().__init__() + self.np_random = None + self.verbose: bool = False + + self.reward: int = 0 + self.done = False + self.state = None + self.info: dict = {} + self.n_bugs: int = 3 + self.config = config + self.sample_size: int = 4 + self.training_set: pd.DataFrame = pd.DataFrame() + self.phase: int = 1 + # self.generator: Generator = Generator() + self.step_counter: int = 0 + self.max_steps: int = 15 + + self.parse_config(self.config) + + self.action_space = gym.spaces.Discrete(((2 * self.n_bugs ** 2) // 2 - self.n_bugs)) + + # Dictionary containing the state of the environment. Contains the control flow matrix, the input samples, + # and the output samples. + self.observation_space = gym.spaces.Dict( + { + "control_flow_matrix": gym.spaces.Box( + low=0, + high=1, + shape=(self.n_bugs * (self.n_bugs - 1),), dtype=np.int64 + ), + "sample_input_pairs": gym.spaces.Box( + low=0, + high=1, + shape=(self.sample_size, self.n_bugs), + dtype=np.int64 + ), + "sample_output_pairs": gym.spaces.Box( + low=0, + high=1, + shape=(self.sample_size, self.n_bugs), + dtype=np.int64 + ) + } + ) + + def reset(self) -> Dict[str, Any]: + """ + Resets the environment and returns the reset state. + :return: + """ + + self.done = False + self.reward = 0 + self.info = dict() + self.step_counter = 0 + + row = self._sample_from_training_set() + + self.state = { + "control_flow_matrix": row["modified_control_flow_matrix"].values[0], + "sample_input_pairs": row["input_samples"].values[0], + "sample_output_pairs": row["output_samples"].values[0] + } + + return self.state + + def step(self, action: int): + """ + Step function of the environment + :param action: edge in the control flow matrix to be set/unset + :return: + """ + # 1. Check if the maximum number of steps has reached and, if so, end the game + self.step_counter += 1 + if self.step_counter > self.max_steps: + self.reward = -1 + self.done = True + self.info = { + "won": 0 + } + return [self.state, self.reward, self.done, self.info] + + # 2. Take the action the agent selected (i.e. set/unset an edge) + cf_matrix = deepcopy(self.state["control_flow_matrix"]) + + cf_matrix[action] = 1 if cf_matrix[action] == 0 else 0 + + # 3. Get the outputs for the corresponding input pairs for the (now) modified control flow matrix + current_outs = get_outputs( + n_bugs=self.n_bugs, + ins=self.state["sample_input_pairs"], + prog=flattened_repr_to_control_flow_matrix(flat_cf_repr=cf_matrix, n_bugs=self.n_bugs) + ) + # 4. Update the state + self.state = { + "control_flow_matrix": cf_matrix, + "sample_input_pairs": self.state["sample_input_pairs"], + "sample_output_pairs": self.state["sample_output_pairs"] + } + + # 5. If the outputs in the observations space equal the ones of the updated control flow matrix + # return a positive reward and end the game + if np.array_equal(current_outs, self.state["sample_output_pairs"]): + self.reward = 1 + self.done = True + self.info = { + "won": 1 + } + # 6. 
else keep the reward at 0 and let the game continue + else: + self.reward = -1 + self.done = False + return [self.state, self.reward, self.done, self.info] + + def render(self, mode: str = "human"): + """ + Render method of the environment. Not implemented for bugbit. + :param mode: + :return: None + """ + raise NotImplementedError("render() is not implemented") + + def parse_config(self, config: dict): + """ + Parses the config dictionary that is passed in __init__ + :param config: dictionary containing the config for the environment + :return: None + """ + self.config = config + self.n_bugs = config.get("n_bugs") + self.training_set = config.get("training_set") + self.max_steps = config.get("max_steps") + self.sample_size = config.get("sample_size") + + def _sample_from_training_set(self) -> pd.Series: + """ + Takes a random sample from the training set depending on the phase the environment is set to + :return: + """ + return self.training_set.loc[self.training_set["distance"] == self.phase].sample(n=1) + + def increment_phase(self): + """ + Set the phase (i.e. difficulty for curriculum learning) of the environment. + Also increases the maximum number of steps for the next phase. + Follows: https://docs.ray.io/en/releases-1.3.0/rllib-training.html#curriculum-learning + :return: None + """ + if self.phase < self.training_set["distance"].max(): + self.phase += 1 + self.max_steps += 1 + print("Phase incremented to: ", self.phase) + else: + print("Maximum phase reached!") + + def seed(self, seed=None): + """Sets the seed for this env's random number generator(s). + Note: + Some environments use multiple pseudorandom number generators. + We want to capture all such seeds used in order to ensure that + there aren't accidental correlations between multiple generators. + Returns: + list