
Commit

Merge pull request #11 from Aurora-tq/test
garylin2099 authored May 30, 2024
2 parents 10d7da5 + 666acd8 commit aa89dc0
Showing 10 changed files with 946 additions and 0 deletions.
30 changes: 30 additions & 0 deletions examples/test_experiments/README_DI.md
@@ -0,0 +1,30 @@
## How to use?
1. Step 1: run `kaggle_to_analysis.py`

2. Step 2: run `first_analysis_to_code.py`

3. Step 3: run `code_to_analysis.py`

4. Step 4: run `random_sample_analysis_with_baseline.py`

5. Step 5: run `insights_to_code.py`
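
The five scripts can also be chained with a small driver. Below is a minimal sketch, assuming the commands are run from the MetaGPT repository root and that the hard-coded paths inside each script have already been edited for your machine:

```python
# Illustrative only: runs the five pipeline stages in order.
import subprocess

STAGES = [
    "examples/test_experiments/kaggle_to_analysis.py",
    "examples/test_experiments/first_analysis_to_code.py",
    "examples/test_experiments/code_to_analysis.py",
    "examples/test_experiments/random_sample_analysis_with_baseline.py",
    "examples/test_experiments/insights_to_code.py",
]

for stage in STAGES:
    subprocess.run(["python", stage], check=True)  # stop if any stage fails
```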

## Results
1. After step 1, you obtain the manual experience extracted from Kaggle.

2. After step 2, DI uses the Kaggle experience to generate the first round of 10 samples.

3. After step 3, you obtain the experience extracted from the first round of samples.

4. After step 4, you obtain insights combined with the baseline code.

5. After step 5, you obtain new code samples generated from the insights for the different task types; these serve as the samples for the second round.

## Note
1. You need to modify the `import` statements in the following file: `~/MetaGPT/metagpt/strategy/task_type.py`:

from metagpt.prompts.task_type import...

from metagpt.prompts.task_type_new import...
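
The two lines above show the old and the new prompt modules; the intent appears to be to point the prompt imports in `task_type.py` at `metagpt.prompts.task_type_new`. A hedged sketch of the edit (`EXAMPLE_PROMPT` is a hypothetical placeholder for whichever symbols your version of the file actually imports):

```python
# metagpt/strategy/task_type.py -- illustrative edit only.
# EXAMPLE_PROMPT stands in for the real symbol names, which depend on your MetaGPT version.

# before:
# from metagpt.prompts.task_type import EXAMPLE_PROMPT

# after:
# from metagpt.prompts.task_type_new import EXAMPLE_PROMPT
```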


163 changes: 163 additions & 0 deletions examples/test_experiments/code_to_analysis.py
@@ -0,0 +1,163 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time : 2023/5/6 14:13
@Author : alexanderwu
@File : code_to_analysis.py
"""
import asyncio
import json
import os
import re
import shutil
from pathlib import Path

from metagpt.llm import LLM
from metagpt.logs import logger
from metagpt.schema import Message
from examples.test_experiments import plan_to_code

STRUCTUAL_PROMPT = """
[User Requirement]
{user_requirement}
[code example]
Here is an example of the code:
{code}
[score]
The score of the example code is: {score}
[instruction]
**Summarize**:
[
"Analyze the data preprocessing and feature selection methods used in the code. Focus on how these methods could potentially improve model performance.",
"List the specific features processed and discuss their contribution to the model's effectiveness.",
"Conclude with a brief rationale on the improvement in model performance due to these methods.",
"Your analysis must be listed in bullet points, with a minimum of 3-5 points(e.g.,1.). Ensure that each point is concise and straightforward.",
]
Output a JSON in the following format:
```json
{{
"metric": "Report the value of the score.",
"lower_is_better": "true if the metric should be minimized (e.g., MSE), false if it should be maximized (e.g., accuracy).",
"Analysis": "Based on the [User Requirement], this concise analysis focuses on the key data processing and feature engineering methods outlined in the code, detailing their potential impact on improving model performance."
}}
```
"""

SUMMARIZE_PROMPT = """
[analysis]
{analysis}
[instruction]
**Summarize**:
[
"From the detailed [analysis] of data preprocessing and feature engineering provided, extract and summarize specific, actionable recommendations that could optimize these processes.",
"Include clear steps on how to implement these improvements and briefly describe their potential impact on model performance.",
"Your summarzie must be listed in bullet points, with a minimum of 3-5 points(e.g.,step 1.). Ensure that each point is concise and straightforward.",
]
Output a JSON in the following format:
```json
{{
"insights": "Based on the detailed [analysis] provided above on code data preprocessing and feature engineering, please summarize specific and actionable recommendations."
}}
```
"""

REFLECTION_SYSTEM_MSG = """You are an expert in machine learning. Please analyze the provided code and point out the valuable insights to consider."""
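
# Prompt flow in this module: analysis_code() applies STRUCTUAL_PROMPT to each code
# sample, summarize_insights() condenses the collected analyses with SUMMARIZE_PROMPT,
# and both LLM calls use REFLECTION_SYSTEM_MSG as the system prompt.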


async def wrap_code(code: str, lang="python") -> str:
"""Wraps code with three backticks."""
return f"```{lang}\n{code}\n```"

async def save_rsp_to_file(rsp, filename):
with open(filename, "w") as file:
json.dump(rsp, file)

async def load_json_data(json_dir):
with open(json_dir, "r") as file:
json_data = json.load(file)
return json_data

async def clean_json_from_rsp(text):
pattern = r"```json(.*?)```"
matches = re.findall(pattern, text, re.DOTALL)
if matches:
json_str = "\n".join(matches)  # join the fenced JSON blocks; avoid shadowing the json module
return json_str
else:
return ""

async def format_output(response_dir, response_path):
new_data = []
rsp_data = await load_json_data(response_path)
for i, item in enumerate(rsp_data["analysis"]):
item_dict = json.loads(item)
data = {"Analysis": item_dict, "id": i}
new_data.append(data)
new_file_path = Path(response_dir) / "response_data_format.json"
await save_rsp_to_file(new_data, new_file_path)
return new_file_path
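
# response_data_format.json ends up as a list of {"Analysis": <parsed analysis>, "id": <sample index>}
# records, one per analyzed code sample.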

async def summarize_insights(data):
llm = LLM()
analysis = data.get("analysis", [])
structual_prompt = SUMMARIZE_PROMPT.format(
analysis=analysis,
)
context = llm.format_msg([Message(content=structual_prompt, role="assistant")])
llm_response = await llm.aask(context, system_msgs=[REFLECTION_SYSTEM_MSG])
logger.info(llm_response)
rsp = await clean_json_from_rsp(llm_response)
return rsp

async def analysis_code(data, requirement: str):
llm = LLM()
code = data.get("code", [])
score = data.get("score", [])
structual_prompt = STRUCTUAL_PROMPT.format(
user_requirement=requirement,
code=await wrap_code(code),
score=score,
)
context = llm.format_msg([Message(content=structual_prompt, role="assistant")])

llm_response = await llm.aask(context, system_msgs=[REFLECTION_SYSTEM_MSG])
logger.info(llm_response)
rsp = await clean_json_from_rsp(llm_response)
return rsp


async def main(
score: list,
first_save_path: str,
requirement: str
):
dataset_name = "Titanic"
path = f"/Users/aurora/Desktop/test_1/{dataset_name}" # Replace with your path
# save_dir = first_save_path.parent
# print(save_dir)
new_plan_path = Path(path) / "First_sample/plan_DI"
save_directory = Path(path) / "First_sample/code_DI_new/"
await plan_to_code.process_code_from_plan(first_save_path, new_plan_path, score, save_directory)
# analysis rsp path
rsp_directory = Path(path) / "Analysis/Second_analysis_DI"
os.makedirs(rsp_directory, exist_ok=True)
response_file = Path(rsp_directory) / "response_data.json"
# code->analysis
analysis_list = []
for i in range(len(score)):
analyze_code_dir = Path(save_directory) / f"code_{i}.json"
analyze_data = await load_json_data(analyze_code_dir)
analysis_rsp = await analysis_code(analyze_data, requirement)
analysis_list.append(analysis_rsp)
response_dict = {"analysis": analysis_list}
await save_rsp_to_file(response_dict, response_file)
await plan_to_code.get_code(save_directory, score)
# format analysis file
final_path = await format_output(rsp_directory, response_file)
print("The results of the analysis have been saved at:", final_path)
return final_path

if __name__ == "__main__":
score = [0.8380, 0.8324, 0.8380]  # Replace with the new score list
file_path = '/Users/aurora/Desktop/metaGPT_new/MetaGPT/data/output_1'  # Replace with your folder path (the path where DI output is automatically saved)
user_requirement = "This is a Titanic passenger survival dataset; your goal is to predict passenger survival outcome. The target column is Survived. Make sure to generate at least 5 tasks each time, including EDA, data preprocessing, feature engineering, model training to predict the target, and model evaluation. Please evaluate their performance separately, and finally report the optimal result. Report the accuracy on the eval data. There is no need to plot."
asyncio.run(main(score, file_path, user_requirement))
54 changes: 54 additions & 0 deletions examples/test_experiments/first_analysis_to_code.py
@@ -0,0 +1,54 @@
import asyncio
import json
import random
from pathlib import Path

from metagpt.roles.di.data_interpreter import DataInterpreter
from metagpt.logs import logger
# from metagpt.utils.recovery_util import save_history
from examples.test_experiments import save_plan

async def load_json_data(json_dir):
with open(json_dir, "r") as file:
json_data = json.load(file)
return json_data

async def random_sample_tools(tools, num_samples):
return random.sample(tools, num_samples)

async def load_analysis(file_path, i):
rsp = await load_json_data(file_path)
data = str(rsp[i]['Analysis']['Analysis'])
return data

async def load_summarize(file_path):
with open(file_path, 'r', encoding='utf-8') as file:
rsp = json.load(file)
return rsp

async def main(file_path: str, data_path: str, requirement: str = "", save_dir: str = ""):
# role = DataInterpreter(use_reflection=True, tools=["<all>"])
train_path = f"{data_path}/split_train.csv"
eval_path = f"{data_path}/split_eval.csv"
analysis_file = await load_json_data(file_path)
for i in range(len(analysis_file)):
role = DataInterpreter(use_reflection=True)
analysis = await load_analysis(file_path, i)
query = requirement + " Here are some insights derived from high-performance code: " + str(analysis) + f" Train data path: '{train_path}', eval data path: '{eval_path}'."
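# The prompt handed to DI is the base requirement, followed by the extracted insights
# and the concrete train/eval CSV paths for this dataset.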
print(query)
rsp = await role.run(query)
logger.info(rsp)
save_path = save_plan.save_history(role=role, save_dir=save_dir, round='First')
print("The results of the code are saved at:", save_path)
return save_path

if __name__ == "__main__":
dataset_name = "Titanic"
save_dir = f"/Users/aurora/Desktop/test_1/{dataset_name}/" #replace with your path
file_path = f"/Users/aurora/Desktop/test_1/{dataset_name}/Analysis/First_analysis_kaggle/response_data_format.json"
#Titanic
data_path = "/Users/aurora/Desktop/ml_benchmark/04_titanic"
user_requirement = f"This is a titanic passenger survival dataset, your goal is to predict passenger survival outcome. The target column is Survived.Make sure to generate at least 5 tasks each time, including eda, data preprocessing, feature engineering, model training to predict the target, and model evaluation. Please evaluate their performance separately, and finally input the optimal result. To predict the target, the Report accuracy on the eval data. Don't need to plot."
asyncio.run(main(file_path,data_path,user_requirement,save_dir))


87 changes: 87 additions & 0 deletions examples/test_experiments/get_score.py
@@ -0,0 +1,87 @@
import re
import json
import os

def extract_highest_scores(file_paths):
"""
Extract the highest scores from a list of JSON file paths.
Parameters:
file_paths (list of str): List of paths to JSON files.
Returns:
list of float: List of highest scores from each file.
"""
scores = []
# Regular expression patterns for matching accuracy values,
# e.g. "random forest: 0.8324" or "Best model on eval data: random forest with accuracy: 0.8380"
accuracy_pattern = re.compile(r'(\w+(?: \w+)*): (\d\.\d{4})')
best_model_pattern = re.compile(r'Best model on eval data: (\w+(?: \w+)*) with accuracy: (\d\.\d{4})')

for file_path in file_paths:
with open(file_path, 'r', encoding='utf-8') as file:
json_data = json.load(file)

# Extract the result field from the JSON data
result_text = json_data['task_map'][str(len(json_data["task_map"]))]["result"]

# Find all accuracies
accuracies = accuracy_pattern.findall(result_text)
best_model = best_model_pattern.search(result_text)

# Determine the highest score in the current file
current_highest = 0.0
for model, accuracy in accuracies:
accuracy_value = float(accuracy)
if accuracy_value > current_highest:
current_highest = accuracy_value

# Check the best model accuracy
if best_model:
best_model_accuracy = float(best_model.group(2))
if best_model_accuracy > current_highest:
current_highest = best_model_accuracy

# Append the highest score to the scores list
scores.append(current_highest)

return scores

def get_max_score_with_index(file_paths):
"""
Get the maximum score and its index from a list of JSON file paths.
Parameters:
file_paths (list of str): List of paths to JSON files.
Returns:
tuple: The highest score and its index in the file_paths list.
"""
highest_scores = extract_highest_scores(file_paths)
if highest_scores:
max_score = max(highest_scores)
max_index = highest_scores.index(max_score)
return max_score, max_index
else:
return None, None

def get_baseline_code_path(file_paths, code_base_path):
"""
Get the baseline code path based on the highest score.
Parameters:
file_paths (list of str): List of paths to JSON files.
code_base_path (str): Base directory path where code files are located.
Returns:
str: The path of the code file with the highest score.
"""
_, max_index = get_max_score_with_index(file_paths)
if max_index is not None:
# Construct the full path to the code file based on the highest score index
code_file_name = f"code_{max_index}.py"
baseline_code_path = f"{code_base_path}{code_file_name}"
return baseline_code_path
else:
return None
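
# Illustrative usage (paths are hypothetical and depend on where DI saved its plan JSONs):
# plan_files = [f"/path/to/plan_DI/plan_{i}.json" for i in range(10)]
# best_score, best_index = get_max_score_with_index(plan_files)
# baseline_path = get_baseline_code_path(plan_files, "/path/to/code_DI/")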
