Merge pull request #11 from Aurora-tq/test
Showing 10 changed files with 946 additions and 0 deletions.
@@ -0,0 +1,30 @@

## How to use?

1. Step 1: run `kaggle_to_analysis.py`
2. Step 2: run `first_analysis_to_code.py`
3. Step 3: run `code_to_analysis.py`
4. Step 4: run `random_sample_analysis_with_baseline.py`
5. Step 5: run `insights_to_code.py`
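A minimal driver sketch for running the five steps above in order (an assumption-laden sketch: it presumes each script can be run standalone from the directory that contains it; adjust paths to your checkout):

```python
import subprocess
import sys

# The five pipeline stages, in the order listed above.
STEPS = [
    "kaggle_to_analysis.py",
    "first_analysis_to_code.py",
    "code_to_analysis.py",
    "random_sample_analysis_with_baseline.py",
    "insights_to_code.py",
]

for step in STEPS:
    print(f"--- running {step} ---")
    subprocess.run([sys.executable, step], check=True)  # stop on the first failure
```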
## The Result

1. After the first step, you will obtain the manual experience extracted from Kaggle.
2. After the second step, DI uses the Kaggle experience to generate the first round of 10 samples.
3. After the third step, you will obtain the experience extracted from the first round of samples.
4. After the fourth step, you will obtain the insights combined with the baseline code.
5. After the fifth step, you will obtain new code samples generated from the insights for each task type; these serve as the samples for the second round.
## Note

1. You need to modify the import statement in the following file: `~/MetaGPT/metagpt/strategy/task_type.py`, changing

   from metagpt.prompts.task_type import...

   to

   from metagpt.prompts.task_type_new import...
@@ -0,0 +1,163 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time    : 2023/5/6 14:13
@Author  : alexanderwu
@File    : llm_hello_world.py
"""
import asyncio
import json
import os
import re
from pathlib import Path

from examples.test_experiments import plan_to_code
from metagpt.llm import LLM
from metagpt.logs import logger
from metagpt.schema import Message
STRUCTURAL_PROMPT = """
[User Requirement]
{user_requirement}
[code example]
Here is an example of the code:
{code}
[score]
The score of the example code is: {score}
[instruction]
**Summarize**:
[
"Analyze the data preprocessing and feature selection methods used in the code. Focus on how these methods could potentially improve model performance.",
"List the specific features processed and discuss their contribution to the model's effectiveness.",
"Conclude with a brief rationale for the improvement in model performance due to these methods.",
"Your analysis must be listed in bullet points, with a minimum of 3-5 points (e.g., 1.). Ensure that each point is concise and straightforward.",
]
Output a JSON in the following format:
```json
{{
    "metric": "Report the value of the score.",
    "lower_is_better": "true if the metric should be minimized (e.g., MSE), false if it should be maximized (e.g., accuracy).",
    "Analysis": "Based on the [User Requirement], this concise analysis focuses on the key data processing and feature engineering methods outlined in the code, detailing their potential impact on improving model performance."
}}
```
"""
SUMMARIZE_PROMPT = """
[analysis]
{analysis}
[instruction]
**Summarize**:
[
"From the detailed [analysis] of data preprocessing and feature engineering provided, extract and summarize specific, actionable recommendations that could optimize these processes.",
"Include clear steps on how to implement these improvements and briefly describe their potential impact on model performance.",
"Your summary must be listed in bullet points, with a minimum of 3-5 points (e.g., step 1.). Ensure that each point is concise and straightforward.",
]
Output a JSON in the following format:
```json
{{
    "insights": "Based on the detailed [analysis] provided above on code data preprocessing and feature engineering, please summarize specific and actionable recommendations."
}}
```
"""


REFLECTION_SYSTEM_MSG = """You are an expert in machine learning. Please analyze the provided code and point out the valuable insights to consider."""
async def wrap_code(code: str, lang: str = "python") -> str:
    """Wraps code with three backticks."""
    return f"```{lang}\n{code}\n```"


async def save_rsp_to_file(rsp, filename):
    with open(filename, "w") as file:
        json.dump(rsp, file)


async def load_json_data(json_dir):
    with open(json_dir, "r") as file:
        json_data = json.load(file)
    return json_data


async def clean_json_from_rsp(text):
    """Extract the contents of any ```json ...``` blocks in an LLM response."""
    pattern = r"```json(.*?)```"
    matches = re.findall(pattern, text, re.DOTALL)
    if matches:
        return "\n".join(matches)
    return ""
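# Illustrative example (hypothetical reply): for
#   'Here is the result:\n```json\n{"metric": "0.8380"}\n```'
# clean_json_from_rsp returns '\n{"metric": "0.8380"}\n', ready for json.loads.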
async def format_output(response_dir, response_path):
    """Re-save the raw analysis list as [{"Analysis": ..., "id": i}, ...] records."""
    new_data = []
    rsp_data = await load_json_data(response_path)
    for i, item in enumerate(rsp_data["analysis"]):
        item_dict = json.loads(item)
        data = {"Analysis": item_dict, "id": i}
        new_data.append(data)
    new_file_path = Path(response_dir) / "response_data_format.json"
    await save_rsp_to_file(new_data, new_file_path)
    return new_file_path
async def summarize_insights(data):
    llm = LLM()
    analysis = data.get("analysis", [])
    structural_prompt = SUMMARIZE_PROMPT.format(
        analysis=analysis,
    )
    context = llm.format_msg([Message(content=structural_prompt, role="assistant")])
    llm_response = await llm.aask(context, system_msgs=[REFLECTION_SYSTEM_MSG])
    logger.info(llm_response)
    rsp = await clean_json_from_rsp(llm_response)
    return rsp


async def analysis_code(data, requirement: str):
    llm = LLM()
    code = data.get("code", [])
    score = data.get("score", [])
    structural_prompt = STRUCTURAL_PROMPT.format(
        user_requirement=requirement,
        code=await wrap_code(code),
        score=score,
    )
    context = llm.format_msg([Message(content=structural_prompt, role="assistant")])

    llm_response = await llm.aask(context, system_msgs=[REFLECTION_SYSTEM_MSG])
    logger.info(llm_response)
    rsp = await clean_json_from_rsp(llm_response)
    return rsp
async def main(
    score: list,
    first_save_path: str,
    requirement: str,
):
    dataset_name = "Titanic"
    path = f"/Users/aurora/Desktop/test_1/{dataset_name}"  # Replace with your path
    new_plan_path = Path(path) / "First_sample/plan_DI"
    save_directory = Path(path) / "First_sample/code_DI_new/"
    await plan_to_code.process_code_from_plan(first_save_path, new_plan_path, score, save_directory)
    # Analysis response path
    rsp_directory = Path(path) / "Analysis/Second_analysis_DI"
    os.makedirs(rsp_directory, exist_ok=True)
    response_file = Path(rsp_directory) / "response_data.json"
    # code -> analysis
    analysis_list = []
    for i in range(len(score)):
        analyze_code_dir = Path(save_directory) / f"code_{i}.json"
        analyze_data = await load_json_data(analyze_code_dir)
        analysis_rsp = await analysis_code(analyze_data, requirement)
        analysis_list.append(analysis_rsp)
    response_dict = {"analysis": analysis_list}
    await save_rsp_to_file(response_dict, response_file)
    await plan_to_code.get_code(save_directory, score)
    # Format the analysis file
    final_path = await format_output(rsp_directory, response_file)
    print("The results of the analysis have been saved at:", final_path)
    return final_path
if __name__ == "__main__":
    score = [0.8380, 0.8324, 0.8380]  # Replace with the new score list
    file_path = "/Users/aurora/Desktop/metaGPT_new/MetaGPT/data/output_1"  # Replace with your folder path (the path where DI output is automatically saved)
    user_requirement = (
        "This is a Titanic passenger survival dataset; your goal is to predict passenger survival outcomes. "
        "The target column is Survived. Make sure to generate at least 5 tasks each time, including EDA, "
        "data preprocessing, feature engineering, model training to predict the target, and model evaluation. "
        "Please evaluate their performance separately, and finally output the optimal result. "
        "Report accuracy on the eval data. There is no need to plot."
    )
    asyncio.run(main(score, file_path, user_requirement))
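For orientation, a small sketch of the record shapes this file passes around (the values are made up; the schema follows STRUCTURAL_PROMPT and format_output above):

```python
import json

# A cleaned LLM reply, as clean_json_from_rsp would return it (illustrative values).
raw = '{"metric": "0.8380", "lower_is_better": "false", "Analysis": "1. Imputed Age; 2. Encoded Sex; 3. Added FamilySize."}'

# format_output parses each such string and wraps it with an index.
record = {"Analysis": json.loads(raw), "id": 0}
print(json.dumps(record, indent=2))
```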
@@ -0,0 +1,54 @@
import asyncio
import json
import random

from examples.test_experiments import save_plan
from metagpt.logs import logger
from metagpt.roles.di.data_interpreter import DataInterpreter
async def load_json_data(json_dir):
    with open(json_dir, "r") as file:
        json_data = json.load(file)
    return json_data


async def random_sample_tools(tools, num_samples):
    return random.sample(tools, num_samples)


async def load_analysis(file_path, i):
    # Each record looks like {"Analysis": {..., "Analysis": "..."}, "id": i}
    # (see format_output in the analysis script), hence the double indexing.
    rsp = await load_json_data(file_path)
    data = str(rsp[i]["Analysis"]["Analysis"])
    return data


async def load_summarize(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        rsp = json.load(file)
    return rsp
async def main(file_path: str, data_path: str, requirement: str = "", save_dir: str = ""):
    train_path = f"{data_path}/split_train.csv"
    eval_path = f"{data_path}/split_eval.csv"
    analysis_file = await load_json_data(file_path)
    save_path = None
    for i in range(len(analysis_file)):
        role = DataInterpreter(use_reflection=True)
        analysis = await load_analysis(file_path, i)
        query = (
            requirement
            + " Here are some insights derived from high-performance code: "
            + str(analysis)
            + f" Train data path: '{train_path}', eval data path: '{eval_path}'."
        )
        print(query)
        rsp = await role.run(query)
        logger.info(rsp)
        save_path = save_plan.save_history(role=role, save_dir=save_dir, round="First")
        print("The results of the code are saved at:", save_path)
    return save_path
if __name__ == "__main__":
    dataset_name = "Titanic"
    save_dir = f"/Users/aurora/Desktop/test_1/{dataset_name}/"  # Replace with your path
    file_path = f"/Users/aurora/Desktop/test_1/{dataset_name}/Analysis/First_analysis_kaggle/response_data_format.json"
    # Titanic
    data_path = "/Users/aurora/Desktop/ml_benchmark/04_titanic"
    user_requirement = (
        "This is a Titanic passenger survival dataset; your goal is to predict passenger survival outcomes. "
        "The target column is Survived. Make sure to generate at least 5 tasks each time, including EDA, "
        "data preprocessing, feature engineering, model training to predict the target, and model evaluation. "
        "Please evaluate their performance separately, and finally output the optimal result. "
        "Report accuracy on the eval data. There is no need to plot."
    )
    asyncio.run(main(file_path, data_path, user_requirement, save_dir))
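For a feel of what each DataInterpreter run receives, here is a sketch of the composed query (the insight text and data paths are illustrative):

```python
requirement = "Predict Titanic survival. The target column is Survived."
analysis = "1. Impute Age with the median; 2. One-hot encode Embarked; 3. Add a FamilySize feature."
train_path, eval_path = "data/split_train.csv", "data/split_eval.csv"

query = (
    requirement
    + " Here are some insights derived from high-performance code: "
    + analysis
    + f" Train data path: '{train_path}', eval data path: '{eval_path}'."
)
print(query)
```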
@@ -0,0 +1,87 @@
import json
import os
import re
def extract_highest_scores(file_paths):
    """
    Extract the highest scores from a list of JSON file paths.

    Parameters:
        file_paths (list of str): List of paths to JSON files.

    Returns:
        list of float: List of highest scores from each file.
    """
    scores = []
    # Regular expression patterns for matching accuracy values
    accuracy_pattern = re.compile(r'(\w+(?: \w+)*): (\d\.\d{4})')
    best_model_pattern = re.compile(r'Best model on eval data: (\w+(?: \w+)*) with accuracy: (\d\.\d{4})')

    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        # Extract the result field of the last task from the JSON data
        result_text = json_data['task_map'][str(len(json_data['task_map']))]['result']

        # Find all accuracies
        accuracies = accuracy_pattern.findall(result_text)
        best_model = best_model_pattern.search(result_text)

        # Determine the highest score in the current file
        current_highest = 0.0
        for model, accuracy in accuracies:
            accuracy_value = float(accuracy)
            if accuracy_value > current_highest:
                current_highest = accuracy_value

        # Check the best model accuracy
        if best_model:
            best_model_accuracy = float(best_model.group(2))
            if best_model_accuracy > current_highest:
                current_highest = best_model_accuracy

        # Append the highest score to the scores list
        scores.append(current_highest)

    return scores
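# Illustrative result text the patterns above are meant to match (made-up values):
#   "Random Forest: 0.8324\nLogistic Regression: 0.8380\n
#    Best model on eval data: Logistic Regression with accuracy: 0.8380"
# For such a file, extract_highest_scores records 0.8380.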
def get_max_score_with_index(file_paths):
    """
    Get the maximum score and its index from a list of JSON file paths.

    Parameters:
        file_paths (list of str): List of paths to JSON files.

    Returns:
        tuple: The highest score and its index in the file_paths list.
    """
    highest_scores = extract_highest_scores(file_paths)
    if highest_scores:
        max_score = max(highest_scores)
        max_index = highest_scores.index(max_score)
        return max_score, max_index
    else:
        return None, None
def get_baseline_code_path(file_paths, code_base_path):
    """
    Get the baseline code path based on the highest score.

    Parameters:
        file_paths (list of str): List of paths to JSON files.
        code_base_path (str): Base directory path where code files are located.

    Returns:
        str: The path of the code file with the highest score.
    """
    _, max_index = get_max_score_with_index(file_paths)
    if max_index is not None:
        # Construct the full path to the code file based on the highest-score index
        code_file_name = f"code_{max_index}.py"
        baseline_code_path = os.path.join(code_base_path, code_file_name)
        return baseline_code_path
    else:
        return None
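A brief usage sketch (the file list and module name are hypothetical; the functions are the ones defined above):

```python
# Assuming the functions above live in a module named extract_score (hypothetical).
from extract_score import (
    extract_highest_scores,
    get_baseline_code_path,
    get_max_score_with_index,
)

files = [f"/path/to/plans/plan_{i}.json" for i in range(3)]  # task-map JSONs saved by DI
print(extract_highest_scores(files))                    # e.g. [0.8380, 0.8324, 0.8380]
print(get_max_score_with_index(files))                  # (0.838, 0) for the values above
print(get_baseline_code_path(files, "/path/to/code/"))  # -> /path/to/code/code_0.py
```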