# data_synthesis.py

from utils.openai_utils import OpenAIClient
import datetime 
import json 
import os 
from rouge_score import rouge_scorer
from collections import defaultdict, OrderedDict 
from tqdm import tqdm 
import numpy as np 
from functools import partial 
from metadata import (
    SEED,
    CELL_LABEL, 
    SEQUENCING_METHOD, 
    TISSUE,
    SPECIES,
    DRUG,
    CHOICES, 
) 
from utils import str2bool 
from typing import (
    List, 
    Iterable, 
    Optional, 
    Callable, 
)
import warnings 
import argparse


# Prompt prefixes, keyed by generation stage.
# - "world_knowledge": prefix for the prompts that list motivations/personalities.
# - "instruction_generation": contains the `{task_name}` and `{hint}` placeholders,
#   which the main script fills with `str.format`.
# - "response_generation": contains the `{task_name}` placeholder, filled the same way.
# - "rephrasing": prefix used when an instruction is too similar to earlier ones and
#   has to be rewritten; it is used as-is (no `str.format` call).
INSTRUCTIONS = {
    "world_knowledge": "You are a helpful AI assistant tasked with considering " + 
        "and listing various potential scenarios.",
    "instruction_generation": "You are a helpful AI assistant. Your task now is to generate a single round conversation " + 
        "about {task_name}. Adopt the perspective of the questioner and consider how they would express themselves. {hint}\n\n",
    "response_generation": "You are now engaged in a single-round conversation about single-cell analysis. " + 
        "You are asked to do {task_name} (see Utterance of Questioner). ",
    "rephrasing": "Humans can express the same meaning in different ways during communication. " + 
        "You are very imaginative. Please rewrite the following sentence " + 
        "so that its meaning remains unchanged but the expression including logic, attitude, style or other attributes " + 
        "of sentences changes significantly.\n\n", 
}
# Output-format requirements appended to the end of each prompt, keyed by the same
# stages as INSTRUCTIONS.
# - "world_knowledge": asks for a numbered list; `{obj}` and `{lower_bound}` are
#   filled with `str.format` by the main script.
# - "instruction_generation" / "rephrasing": ask for the answer inside
#   `<result>...</result>`; the main script extracts it with those tags.
# - "response_generation": asks for the answer inside `<response>...</response>`.
# Only the "world_knowledge" entry is passed through `str.format`; the others are
# inserted verbatim.
REQUIREMENTS = {
    "world_knowledge": "The output should be formatted as follows:\n1. x1\n2. x2\n3. x3\n...\nn. xn\n\n"  + 
        "where xi represents the i-th {obj}.\nEnsure that n is greater than {lower_bound}. The more, the better.\n",
    "instruction_generation": "Consider the relevant information of questioner and task, " + 
        "then give the corresponding statement from the questioner. The format for the generated answer is:\n" + 
        "<result>[text]</result>\n" + "Replace \"[text]\" with your answer. " +  
        "Fragments enclosed in curly braces are not placeholders; kindly refrain from replacing them.\n\n" + 
        "Note that before give the final answer, you should give your thinking process.\n\n" +
        "Try to think yourself into the role!\n", 
    "response_generation": "Please give your response. The format for the generated response is:\n" + 
        "<response>[text]</response>\n" + "Replace \"[text]\" with your answer. " + 
        "Fragments enclosed in curly braces are not placeholders; kindly refrain from replacing them.",
    "rephrasing": "Consider the relevant information of questioner and task, " + 
        "then give the corresponding statement from the questioner. The format for the generated answer is:\n" + 
        "<result>[text]</result>\n" + "Replace \"[text]\" with your answer. " +  
        "Fragments enclosed in curly braces are not placeholders; kindly refrain from replacing them.\n\n" + 
        "Note that before give the final answer, you should give your thinking process.\n",  
}


# Maps a human-readable attribute name (shown to the LLM) to the literal slot token
# that must appear in the generated template.
# `f"{{{NAME}}}"` evaluates to the value of the imported `metadata` constant wrapped
# in one pair of literal braces (e.g. a constant equal to "x" yields "{x}").
# The last two entries both map to the fixed token "{input}".
ATTRIBUTES = {
    "Cell type": f"{{{CELL_LABEL}}}", 
    "Sequencing method": f"{{{SEQUENCING_METHOD}}}",
    "Tissue": f"{{{TISSUE}}}",
    "Species": f"{{{SPECIES}}}", 
    "Drug": f"{{{DRUG}}}",
    "Options": f"{{{CHOICES}}}", 
    "Single cell to be annotated": "{input}", 
    "Single cell whose response to a drug is to be predicted": "{input}",
}
# Per-task configuration, keyed by task name (the key is also inserted into prompts
# as `{task_name}`). Every entry has the same fields, consumed by the main script:
# - "attributes": ATTRIBUTES keys that are always listed in the instruction prompt.
# - "optional_attributes": ATTRIBUTES keys that are listed only when they are also
#   present in the `--optional_attributes` CLI selection.
# - "expected_result": text placed under "# Expected Response of Task:".
# - "setting": text used when asking the LLM to enumerate questioner motivations.
# - "response_condition" / "response_constraint": text placed in the
#   response-generation prompt.
# - "hint": extra sentence substituted for `{hint}` in the instruction prompt.
# Brace-wrapped fragments such as {input}, {output} and {drug} are sent to the LLM
# literally; these strings are interpolated with f-strings, never `str.format`.
TASK_META_DATA = {
    "conditional pseudo cell generation": {
        "attributes": [
            "Cell type", 
            "Sequencing method",
            "Tissue",
            "Species", 
        ],
        "optional_attributes": [
        ], 
        "expected_result": "Another person being asked generates a single-cell gene profile whose biological characteristics " + 
            "are in line with the task's requirement is generated.", 
        "setting": "When an individual asks an AI assistant to help him generate single-cell data based on scRNA-seq, " +
            "identify the underlying motivations.", 
        "response_condition": "You already generated a cell by yourself, which is represented by \"{output}\".", 
        "response_constraint": "1. {output} must be at the end of your response.\n" + 
            "2. The word count must not exceed 100.\n\n", 
        "hint": "Note that the questioner only cares about the cell generated by another person in this dialog.", 
    }, 
    "cell type annotation": {
        "attributes": [
            "Single cell to be annotated", 
        ],
        "optional_attributes": [
            "Sequencing method",
            "Tissue",
            "Species",  
            "Options", 
        ],
        "expected_result": "Another person being asked provides the questioner with the correct cell type label of given single cell {input}.", 
        "setting": "When an individual asks an AI assistant to help him annotate the cell type of a single cell, " + 
            "identify the underlying motivations.", 
        "response_condition": "You already assigned the cell with the correct cell type label by yourself, which is represented by \"{output}\".", 
        "response_constraint": "1. Cannot contain {input}.\n" + 
            "2. The word count must not exceed 100.\n\n",
        "hint": "", 
    }, 
    "drug sensitivity prediction": {
        "attributes": [ 
            "Drug",
            "Single cell whose response to a drug is to be predicted", 
        ],
        "optional_attributes": [
            "Sequencing method",
            "Tissue",
            "Species",
            "Options",   
        ],
        "expected_result": "Another person being asked predict the response of single cell {input} to the drug {drug} given by the questioner.",  
        "setting": "When an individual asks an AI assistant to help him predict the sensitivity of a single cell for a specific drug, " + 
            "identify the underlying motivations.", 
        "response_condition": "You already assigned the cell with the correct drug response label by yourself, which is represented by \"{output}\".", 
        "response_constraint": "1. Cannot contain {input}.\n" + 
            "2. The word count must not exceed 100.\n\n",
        "hint": "", 
    }, 
}

def check_last_token(text: str, last_token: str) -> bool:
    """Tell whether ``text`` terminates with ``last_token``.

    Args:
        text: The string to inspect.
        last_token: The suffix ``text`` is required to end with.

    Returns:
        ``True`` if ``text`` ends with ``last_token`` (an exact,
        case-sensitive suffix match), ``False`` otherwise.
    """
    has_expected_suffix = text.endswith(last_token)
    return has_expected_suffix

def check_target_words(text: str, words: Iterable[str]) -> bool:
    """Tell whether every required word occurs somewhere in ``text``.

    Args:
        text: The string to search.
        words: Substrings that must all be present in ``text``.

    Returns:
        ``True`` when each element of ``words`` is a substring of ``text``
        (vacuously ``True`` for an empty ``words``), ``False`` as soon as one
        is missing.
    """
    for required in words:
        if required not in text:
            return False
    return True

def check_text_length(text: str, max_length: int, min_length: int) -> bool:
    """Tell whether the word count of ``text`` lies within the given bounds.

    Words are the whitespace-separated chunks produced by ``str.split()``.

    Args:
        text: The string whose words are counted.
        max_length: Largest accepted number of words (inclusive).
        min_length: Smallest accepted number of words (inclusive).

    Returns:
        ``True`` if ``min_length <= word count <= max_length``.
    """
    word_count = len(text.split())
    if word_count < min_length:
        return False
    return word_count <= max_length

def check_banned_words(text: str, banned_words: Iterable[str]) -> bool:
    """Tell whether ``text`` is free of every banned substring.

    Args:
        text: The string to search.
        banned_words: Substrings that must not occur in ``text``
            (matching is case-sensitive).

    Returns:
        ``True`` when none of ``banned_words`` is found in ``text``,
        ``False`` if at least one is.
    """
    contains_banned = any(term in text for term in banned_words)
    return not contains_banned


def post_process_world_knowledge(text: str) -> Optional[List[str]]:
    """Turn a numbered-list LLM answer into a list of unique items.

    The expected input looks like ``"1. x1\\n2. x2\\n..."``. Each line is
    stripped, a leading ``"<digits>."`` enumeration marker is removed, blank
    lines are dropped and duplicates are removed while keeping first-seen
    order (so the result is reproducible across runs).

    Only the enumeration prefix is cut off: periods inside or at the end of
    an item (``"e.g."``, a trailing full stop) are preserved. Lines without a
    numeric prefix are kept unchanged.

    Args:
        text: Raw text returned by the model.

    Returns:
        The list of extracted items (possibly empty), or ``None`` when
        ``text`` is not a string and therefore cannot be parsed.
    """
    if not isinstance(text, str):
        print(f"Error during postprocessing: expected str, got {type(text).__name__}")
        return None

    items: List[str] = []
    seen = set()
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        # Split on the FIRST period only, and treat it as an enumeration
        # marker only when everything before it is a number.
        head, dot, tail = line.partition(".")
        if dot and head.strip().isdigit():
            line = tail.strip()
        if line and line not in seen:
            seen.add(line)
            items.append(line)
    return items
    

def extract_content(
    text: str, 
    start_tag: str, 
    end_tag: str, 
    filters: Optional[Iterable[Callable[[str], bool]]] = None,
) -> Optional[str]:
    """Pull out the text enclosed by a single ``start_tag`` ... ``end_tag`` pair.

    The span runs from the first occurrence of ``start_tag`` to the last
    occurrence of ``end_tag``. The extraction is rejected when either tag is
    missing, when the closing tag precedes the opening one, when another copy
    of either tag appears inside the span, or when any filter returns a falsy
    value for the extracted text.

    Args:
        text: The string to search.
        start_tag: Opening marker.
        end_tag: Closing marker; must differ from ``start_tag``.
        filters: Optional predicates applied in order to the extracted text;
            evaluation stops at the first one that fails.

    Returns:
        The enclosed text (not stripped), or ``None`` if it was rejected.

    Raises:
        ValueError: If ``start_tag`` and ``end_tag`` are equal.
    """
    if start_tag == end_tag:
        raise ValueError("start_tag and end_tag should be different")

    opening = text.find(start_tag)
    closing = text.rfind(end_tag)
    # `closing < opening` also covers a missing end tag (-1) once we know
    # the start tag was found.
    if opening < 0 or closing < opening:
        return None

    content = text[opening + len(start_tag): closing]
    if any(tag in content for tag in (start_tag, end_tag)):
        return None

    if filters is None:
        return content
    return content if all(accept(content) for accept in filters) else None

if __name__ == "__main__":
    # Script entry point. Two phases:
    #   1. (only when --knowledge_dir is empty) ask the LLM to list questioner
    #      motivations (per task) and personalities, and store them as
    #      motivation.json / personality.json in --output_dir.
    #   2. (unless --only_synthesize_world_knowledge) generate instruction and
    #      response templates for every task in TASK_META_DATA, rephrasing
    #      instructions that are too similar (ROUGE-L) to earlier ones, and
    #      store everything in <output_dir>/raw_templates.json.
    parser = argparse.ArgumentParser()
    # parameters for the OpenAI API
    parser.add_argument("--api_key", type=str, required=True, help="The api key for the OpenAI API.")
    parser.add_argument("--base_url", type=str, required=True, help="The base url for the OpenAI API.")
    parser.add_argument("--output_dir", required=True, type=str, help="The output directory.")
    parser.add_argument(
        "--timeout", 
        default=60, 
        type=int, 
        help="The timeout for the OpenAI API. If the request takes longer than this value, it will be terminated."
    )
    parser.add_argument(
        "--max_retries", 
        default=2, 
        type=int, 
        help="The maximum number of retries for the OpenAI API."
    )
    parser.add_argument("--model", default='gpt-4o', type=str, help="The model for the OpenAI API.")
    parser.add_argument(
        "--stream", 
        default=True, 
        type=str2bool,  
        help="Whether to stream the response from the OpenAI API."
    )
    parser.add_argument(
        "--temperature", 
        default=0.8, 
        type=float, 
        help="The temperature for the OpenAI API. If the value is high, the model will be more creative."
    )
    parser.add_argument(
        "--top_p", 
        default=0.95, 
        type=float, 
        help="The top_p for the OpenAI API. If the value is high, the model will be more creative."
    )

    # parameters for data synthesis
    parser.add_argument(
        "--max_tolerance", 
        default=3, 
        type=int, 
        help="The maximum tolerance for the OpenAI API. If the postprocessing function fails to get the final response, " +
            "API will be called again. The maximum number of callbacks is max_tolerance."
    )
    parser.add_argument(
        "--num_templates_for_task", 
        default=360, 
        type=int, 
        help="The number of generated templates for each task."
    )
    parser.add_argument(
        "--seed", 
        default=SEED, 
        type=int, 
        help="The seed used to control the randomness of program."
    )
    parser.add_argument(
        "--no_questioner_metadata_ratio", 
        default=0.1, 
        type=float, 
        help="The ratio of no questioner metadata. For example, if the value is 0.1, " +
            "then 10% of prompts used to generate input templates will not contain questioner metadata."
    )
    parser.add_argument(
        "--optional_attributes", 
        default=None, 
        type=str, 
        nargs='+',
        help="The optional attributes for the task."
    )
    parser.add_argument(
        "--questioner_comb_probs", 
        default=[0.2, 0.5, 0.3], 
        type=float, 
        nargs='+',
        help="The combination probabilities of questioner metadata."
    )
    parser.add_argument(
        "--questioner_factors", 
        default=['Motivation', 'Personality', 'Level of proficiency in single-cell analysis'], 
        type=str,
        nargs='+',
        help="The factors of questioner metadata."
    )
    parser.add_argument(
        "--knowledge_dir", 
        default="../world_knowledge", 
        type=str, 
        help="The directory for personalities and motivations."
    )
    parser.add_argument(
        "--previous_inst_dir", 
        default=None, 
        type=str, 
        help="The directory for previous instructions."
    )
    parser.add_argument(
        "--similarity_threshold", 
        default=0.75, 
        type=float, 
        help="The similarity threshold for the templates. If current template is similar to previous templates, " +
            "it will be refined."
    )
    parser.add_argument(
        "--num_refinement_steps", 
        default=3, 
        type=int, 
        help="The number of refinement steps. If the similarity between current template and previous templates is " +
            "greater than the similarity threshold, the current template will be refined. " + 
            "If the refined template is still similar to previous templates, it will be refined again. " +
            "The maximum number of refinement steps is num_refinement_steps."
    )
    parser.add_argument(
        "--debug", 
        default=True, 
        type=str2bool, 
        help="Whether to enable the debug mode."
    )
    parser.add_argument(
        "--options_ratio", 
        default=0.5, 
        type=float, 
        help="The ratio of input templates of classification tasks containing available options."
    )

    parser.add_argument(
        "--only_synthesize_world_knowledge", 
        default=False, 
        type=str2bool,
        help="whehter to synthesize the world knowledge only." 
    )

    # parameters of filtering 
    parser.add_argument(
        "--max_length", 
        default=70, 
        type=int, 
        help="The maximum length of templates. If the length of templates is greater than this value, " +
            "it will be filtered out."
    )
    args = parser.parse_args()

    # Running totals, reported once template synthesis has finished.
    total_cost = 0.0 
    total_time = 0.0 
    total_refinement_steps = 0 
    client = OpenAIClient(
        api_key=args.api_key, 
        base_url=args.base_url, 
        timeout=args.timeout,
        max_retries=args.max_retries,  
    )
    task_names = list(TASK_META_DATA.keys())
    # exist_ok avoids the check-then-create race of `if not exists: makedirs`.
    os.makedirs(args.output_dir, exist_ok=True)

    stream = args.stream
    model = args.model
    temperature = args.temperature
    top_p = args.top_p
    max_tolerance = args.max_tolerance

    if not args.knowledge_dir:
        # world_knowledge mode is enabled: it only runs when --knowledge_dir is
        # passed as an empty string.
        objects = ["motivation", "personality"]
        prompt = INSTRUCTIONS["world_knowledge"]
        for obj in objects:
            if obj == 'motivation':
                # One prompt per task; results are stored as {task_name: [motivations]}.
                examples = "# Examples\n1. Unwilling to do single cell analysis\n2. To learn the knowledge of single cell analysis"
                prompts = [f'{prompt} {TASK_META_DATA[task_name]["setting"]}\n\n{examples}\n\n' for task_name in task_names]
                outputs = {} 
            else:
                # A single task-independent prompt; results are stored as a flat list.
                prompts = [f'{prompt} Please list personalities that effect the utterance of speaker.\n\n']
                outputs = [] 
            requirement = REQUIREMENTS["world_knowledge"].format(obj=obj, lower_bound=60 if obj != 'motivation' else 30)
            for i, current_prompt in enumerate(prompts):
                current_prompt_ = current_prompt + requirement
                messages = [{"role": "user", "content": current_prompt_}]
                response_dict = client.get_text_generation_output(
                    messages,
                    post_processor=post_process_world_knowledge,
                    max_tolerance=max_tolerance,
                    model=model, 
                    stream=stream, 
                    temperature=temperature, 
                    top_p=top_p, 
                )
                total_cost += response_dict["cost"] 
                total_time += response_dict["time"]
                situations = response_dict["processed_content"]
                if situations is None:
                    warnings.warn(
                        "Postprocessing function fails to get the final response. " + \
                        f"The raw content: {response_dict['content']}", 
                        UserWarning 
                    ) 
                    situations = [] 
                if obj == 'motivation':
                    # `prompts` was built from `task_names`, so index i maps to task i.
                    outputs[task_names[i]] = situations
                else:
                    # for personality
                    # filter out the empty string and the string with more than 5 words 
                    outputs.extend(
                        output for output in situations
                        if output and len(output.split(' ')) <= 5
                    )
            output_file = os.path.join(args.output_dir, f"{obj}.json")
            with open(output_file, 'w') as f:
                json.dump(outputs, f, indent=4)
        # The freshly written files become the knowledge source for phase 2.
        args.knowledge_dir = args.output_dir 

    if not args.only_synthesize_world_knowledge:
        previous_templates = {} 
        input_file = os.path.join(args.knowledge_dir, "motivation.json")
        with open(input_file, 'r') as f:
            motivations = json.load(f)
        input_file = os.path.join(args.knowledge_dir, "personality.json")
        with open(input_file, 'r') as f:
            personalities = json.load(f)
        proficiency = ["high", "low, layman, no need for explanation"]

        input_file = None 
        if args.previous_inst_dir is not None:
            input_file = os.path.join(args.previous_inst_dir, "raw_templates.json")
        if input_file is not None and os.path.exists(input_file):
            with open(input_file, 'r') as f:
                raw_templates = json.load(f)
            for key, value in raw_templates.items():
                # Wrap loaded entries in defaultdict(list) so they behave like
                # freshly created ones (appending to a missing key must not
                # raise KeyError).
                previous_templates[key] = defaultdict(list, value)

        if args.debug:
            # In debug mode every stored instruction has a matching entry in the
            # "*_most_similar_candidates" lists. Verify this before spending any
            # API calls. This replaces in-loop asserts that were tautological
            # for defaultdicts and are stripped under `python -O`.
            for loaded_task, loaded_templates in previous_templates.items():
                for candidates_key in (
                    "instruction_most_similar_candidates",
                    "response_most_similar_candidates",
                ):
                    if len(loaded_templates[candidates_key]) != len(loaded_templates["instruction"]):
                        raise ValueError(
                            "It seems that generation of previous templates are not in debugging mode "
                            f"(task: {loaded_task}, key: {candidates_key})."
                        )

        if args.optional_attributes is None:
            args.optional_attributes = ["Sequencing method", "Tissue", "Species"]
        # "Options" must be present exactly once and LAST: the code below drops
        # it with `slots[:-1]` and detects it with `slots[-1]`.
        args.optional_attributes = [
            attr for attr in args.optional_attributes if attr != "Options"
        ] + ["Options"]

        if len(args.questioner_factors) != len(args.questioner_comb_probs):
            raise ValueError("The length of questioner_factors and questioner_comb_probs should be the same.")
        # Compare with a tolerance: exact float equality rejects valid inputs
        # whose sum is off by a rounding error. Skipped when there are no
        # factors so that the warning branch below is reachable.
        if len(args.questioner_factors) > 0 and not np.isclose(sum(args.questioner_comb_probs), 1.0):
            raise ValueError("The sum of questioner_comb_probs should be 1.0.")
        available_key = {
            "Motivation", 
            "Personality",
            "Level of proficiency in single-cell analysis",
        }
        for questioner_factor in args.questioner_factors:
            if questioner_factor not in available_key:
                raise ValueError(f"{questioner_factor} is not a valid key for questioner_factors.")
        if len(args.questioner_factors) == 0 and args.no_questioner_metadata_ratio != 1.0:
            warnings.warn(
                "The ratio of no questioner metadata is not 1.0, but the questioner_factors is empty. " + 
                "It seems that the questioner_factors is not set correctly. " + 
                "The ratio of no questioner metadata is set to 1.0.",
                UserWarning
            )
            args.no_questioner_metadata_ratio = 1.0

        # start to synthesize data 
        scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)
        inst_gen_prompt = INSTRUCTIONS["instruction_generation"]
        inst_req_prompt = REQUIREMENTS["instruction_generation"]
        resp_gen_prompt = INSTRUCTIONS["response_generation"]
        resp_req_prompt = REQUIREMENTS["response_generation"]
        generator = np.random.default_rng(args.seed)


        progress_bar = tqdm(
            total=len(task_names) * args.num_templates_for_task, 
            desc=f"Generating templates, target directory: {args.output_dir}"
        )
        # Account for templates that were loaded from a previous run.
        progress_bar.update(sum(len(previous_templates.get(task_name, {"instruction": []})["instruction"]) for task_name in task_names))
        for task_name in task_names:
            if task_name not in previous_templates:
                previous_templates[task_name] = defaultdict(list)
            task_templates = previous_templates[task_name]
            # Slot name -> slot token, in prompt order: mandatory attributes
            # first, then the selected optional ones ("Options" last).
            task_metadata = OrderedDict()
            for attr in TASK_META_DATA[task_name]["attributes"]:
                task_metadata[attr] = ATTRIBUTES[attr] 
            hint = TASK_META_DATA[task_name]["hint"]
            prompt = inst_gen_prompt.format(task_name=task_name, hint=hint) 
            optional_task_attr = set(TASK_META_DATA[task_name]["optional_attributes"])
            for attr in args.optional_attributes:
                if attr in optional_task_attr:
                    task_metadata[attr] = ATTRIBUTES[attr]
            expected_result = TASK_META_DATA[task_name]["expected_result"]
            task_motivations = motivations[task_name]
            # used to generate the response
            response_condition = TASK_META_DATA[task_name]["response_condition"]
            response_constraint = TASK_META_DATA[task_name]["response_constraint"]
            response_prompt = resp_gen_prompt.format(task_name=task_name)
            # to generate new instruction and response templates
            # Resume from the number of stored INSTRUCTIONS. (The number of keys
            # in the per-task dict is unrelated to how many templates exist.)
            num_existing_templates = len(task_templates["instruction"])
            for _ in range(num_existing_templates, args.num_templates_for_task):
                slots = list(task_metadata.keys())
                words = list(task_metadata.values())
                # With probability (1 - options_ratio) drop the trailing
                # "Options" slot for this template.
                if "Options" in task_metadata and generator.random() > args.options_ratio:
                    slots = slots[:-1]
                    words = words[:-1]
                # Instruction must fit the length budget and mention every slot token.
                inst_extract_func = partial(
                    extract_content, 
                    start_tag="<result>", 
                    end_tag="</result>", 
                    filters=[
                        partial(check_text_length, max_length=args.max_length, min_length=1), 
                        partial(check_target_words, words=words), 
                    ],
                )
                # Response must mention "{output}"; for cell generation it must
                # also end with it and avoid placeholder gene names.
                resp_extract_func = partial(
                    extract_content, 
                    start_tag="<response>", 
                    end_tag="</response>", 
                    filters=[
                        partial(check_target_words, words=["{output}"]), 
                        partial(check_text_length, max_length=args.max_length, min_length=1)
                    ] if task_name != "conditional pseudo cell generation" else [
                        partial(check_last_token, last_token="{output}"),
                        partial(check_text_length, max_length=args.max_length, min_length=1), 
                        partial(check_target_words, words=["{output}"]), 
                        partial(
                            check_banned_words, 
                            banned_words=[
                                "geneA", 
                                "gene A", 
                                "GeneA", 
                                "Gene A", 
                                "gene 1", 
                                "gene1",
                                "Gene1",
                                "GENE1",
                                "Gene 1", 
                                "GENE 1", 
                            ],
                        )
                    ], 
                )
                # first generate the instruction templates 
                current_prompt = prompt
                if generator.random() > args.no_questioner_metadata_ratio:
                    # add questioner metadata
                    current_prompt = f"{current_prompt}# Relevant information about the questioner:\n"
                    # NOTE(review): this draws the factor count from
                    # 0 .. len(factors) - 1, so all factors are never selected
                    # together and a count of 0 leaves the header above empty.
                    # Possibly an off-by-one (1 .. len(factors) intended) —
                    # confirm with the author before changing the sampling.
                    num_questioner_var = generator.choice(list(range(len(args.questioner_factors))), p=args.questioner_comb_probs)
                    selected_var = generator.choice(
                        args.questioner_factors, 
                        size=num_questioner_var, 
                        replace=False
                    )
                    for key_var in selected_var:
                        if key_var == 'Motivation':
                            options = task_motivations
                        elif key_var == 'Personality':
                            options = personalities
                        else:
                            options = proficiency
                        current_prompt = f"{current_prompt}{key_var}: {generator.choice(options)}\n"
                    current_prompt = f"{current_prompt}\n"
                # add task-related information
                # Slots are shuffled so the model does not always see one order.
                slots_ = generator.permutation(slots)
                slot_value_pair = '\n'.join([f"{slot}: {task_metadata[slot]}" for slot in slots_])
                current_prompt = f"{current_prompt}# Task-Related Information:\n{slot_value_pair}\n\n"
                if slots[-1] != "Options":
                    current_prompt = f"{current_prompt}# Expected Response of Task:\n{expected_result}\n\n{inst_req_prompt}"
                else:
                    current_prompt = f"{current_prompt}# Expected Response of Task:\n{expected_result} " + \
                        f"The prediction is one of the options provided by the questioner.\n\n{inst_req_prompt}"
                messages = [{"role": "user", "content": current_prompt}]
                response_dict = client.get_text_generation_output(
                    messages,
                    post_processor=inst_extract_func,
                    max_tolerance=max_tolerance,
                    model=model, 
                    stream=stream, 
                    temperature=temperature, 
                    top_p=top_p, 
                )
                total_cost += response_dict["cost"] 
                total_time += response_dict["time"]
                new_instruction = response_dict["processed_content"]
                if new_instruction is None:
                    warnings.warn(
                        "Postprocessing function fails to get the final response. " + \
                        f"The raw content: {response_dict['content']}", 
                        UserWarning 
                    ) 
                    progress_bar.update(1)
                    continue 
                # The lambda reads `new_instruction` at call time, so it scores
                # the latest (possibly rephrased) instruction.
                scorer_func = np.vectorize(
                    lambda prediction: scorer.score(
                        new_instruction, 
                        prediction
                    )['rougeL'].fmeasure
                )
                if len(task_templates["instruction"]) > 0:
                    scores = scorer_func(task_templates["instruction"])
                else:
                    scores = np.array([0.0])
                max_score = np.max(scores)
                current_refinement_step = 0 
                # refine the input instruction to make it more diverse
                while max_score > args.similarity_threshold and current_refinement_step < args.num_refinement_steps:
                    current_refinement_step += 1
                    total_refinement_steps += 1 
                    inst_rephasing_prompt, rephasing_req_prompt = INSTRUCTIONS["rephrasing"], REQUIREMENTS["rephrasing"]
                    rephasing_prompt = f"{inst_rephasing_prompt}# Original Sentence:\n{new_instruction}\n\n{rephasing_req_prompt}"
                    rephasing_messages = [{"role": "user", "content": rephasing_prompt}]
                    response_dict = client.get_text_generation_output(
                        rephasing_messages,
                        post_processor=inst_extract_func,
                        max_tolerance=max_tolerance,
                        model=model, 
                        stream=stream, 
                        temperature=temperature, 
                        top_p=top_p, 
                    )
                    total_cost += response_dict["cost"] 
                    total_time += response_dict["time"]
                    if response_dict["processed_content"] is not None:
                        new_instruction = response_dict["processed_content"]
                        scores = scorer_func(task_templates["instruction"])
                        max_score = np.max(scores)
                # Instructions still too similar after refinement are discarded.
                if max_score <= args.similarity_threshold:
                    # then generate the response templates   
                    current_prompt = response_prompt 
                    current_prompt = f"{current_prompt}{response_condition} Therefore, you are organizing your response now.\n\n"
                    current_prompt = f"{current_prompt}# Utterance of Questioner:\n{new_instruction}\n\n"
                    if slots[-1] != "Options":
                        current_prompt = f"{current_prompt}# Constraint of Response:\n{response_constraint}\n\n"
                    else:
                        # NOTE(review): "{choices}" is hard-coded here while the
                        # instruction uses ATTRIBUTES["Options"] (built from
                        # metadata.CHOICES) — confirm the two always agree.
                        current_prompt = f"{current_prompt}# Constraint of Response:\n{response_constraint}\n" + \
                            "3. Cannot contain {choices}.\n\n"
                    current_prompt = f"{current_prompt}{resp_req_prompt}"
                    messages = [{"role": "user", "content": current_prompt}]
                    response_dict = client.get_text_generation_output(
                        messages,
                        post_processor=resp_extract_func,
                        max_tolerance=max_tolerance,
                        model=model, 
                        stream=stream, 
                        temperature=temperature, 
                        top_p=top_p, 
                    )
                    total_cost += response_dict["cost"] 
                    total_time += response_dict["time"]
                    if response_dict["processed_content"] is not None:
                        new_response = response_dict["processed_content"]
                        scorer_func = np.vectorize(
                            lambda prediction: scorer.score(
                                new_response, 
                                prediction
                            )['rougeL'].fmeasure
                        )
                        if len(task_templates["response"]) > 0:
                            response_scores = scorer_func(task_templates["response"])
                        else:
                            response_scores = np.array([0.0])
                        response_max_score = np.max(response_scores)
                        for key, value in zip(
                            ["instruction", "response", "instruction_score", "response_score"], 
                            [new_instruction, new_response, max_score, response_max_score]
                        ):
                            task_templates[key].append(value)
                        if args.debug:
                            # `scores` / `response_scores` were computed before the
                            # append above, so their indices refer to the earlier
                            # templates only; keep the (up to) 3 most similar ones.
                            task_templates["instruction_most_similar_candidates"].append(
                                [task_templates["instruction"][i] for i in np.argsort(scores)[-3: ]] if len(task_templates["instruction"]) > 1 else [] 
                            )
                            task_templates["response_most_similar_candidates"].append(
                                [task_templates["response"][i] for i in np.argsort(response_scores)[-3: ]] if len(task_templates["response"]) > 1 else []
                            )
                progress_bar.update(1)
        progress_bar.close()

        print("Total Cost: {:.2f}$".format(total_cost))
        print(f"Total Time: {str(datetime.timedelta(seconds=total_time))}")
        print(f"Total Refinement Steps: {total_refinement_steps}")

        output_file = os.path.join(args.output_dir, "raw_templates.json")
        with open(output_file, 'w') as f:
            json.dump(previous_templates, f, indent=4)