LAION-AI · monster29000 · Aug 17, 2024
@@ -12,6 +12,7 @@ def num_tokens_from_string(string: str) -> int:
 
 
 if __name__ == "__main__":
+    # TODO: pass an explicit `dtype` mapping to `pd.read_csv` instead of relying on type inference.
     sampled_df = pd.read_csv("wiki_qa_bart_10000row_input.csv")
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(device)

@@ -3,6 +3,7 @@
 import pandas as pd
 
 if __name__ == "__main__":
+    # TODO: pass an explicit `dtype` mapping to `pd.read_csv` instead of relying on type inference.
     raw_df = pd.read_csv(r"...\wiki_qa_bart_10000row.csv")
     # print(raw_df.iloc[0])
     # print(raw_df.columns)

@@ -63,6 +63,7 @@ def main(output_dir: str = "data"):
     """Download and prepare the dataset for use."""
     os.makedirs(output_dir, exist_ok=True)
     kaggle.api.dataset_download_files("tboyle10/medicaltranscriptions", "data", unzip=True)
+    # TODO: pass an explicit `dtype` mapping to `pd.read_csv` instead of relying on type inference.
     mt_samples = preprocess(pd.read_csv("data/mtsamples.csv"))
     conversations = get_conversations(mt_samples)
     random.shuffle(conversations)

@@ -14,6 +14,7 @@
 
 # Read the CSV file into a pandas dataframe
 csv_file = os.path.join(download_path, "PoetryFoundationData.csv")
+# TODO: pass an explicit `dtype` mapping to `pd.read_csv` instead of relying on type inference.
 df = pd.read_csv(csv_file)
 
 # The data in the CSV file is not formatted correctly, so we need to clean it up.

@@ -43,6 +43,7 @@ def reformat_csv_to_openassistant(df: pd.DataFrame) -> pd.DataFrame:
 if __name__ == "__main__":
     input_csv = "zhihu.csv"
     # Create a pandas dataframe from your dataset file(s)
+    # TODO: pass an explicit `dtype` mapping to `pd.read_csv` instead of relying on type inference.
     df = pd.read_csv(input_csv)  # or any other way
     df = reformat_csv_to_openassistant(df)
     # Save the file in the Parquet format

@@ -458,6 +458,7 @@ def parse_arguments():
 
 
 def read_data(args):
+    # TODO: pass `dtype={"file": str}` to `pd.read_csv` so the "file" column is always read as strings.
     files = pd.read_csv(args.dataset, sep=",", header=None, names=["file"])
     files = files["file"].tolist()
     data = []