Feat: layout for cuda kernels

gpu-mode · Nov 9, 2024 · e474abe · e474abe
1 parent 668e4ea
commit e474abe
Show file tree

Hide file tree

Showing 2 changed files with 94 additions and 58 deletions.
diff --git a/.github/workflows/train_workflow.yml b/.github/workflows/train_workflow.yml
@@ -3,28 +3,58 @@ on:
   workflow_dispatch:
     inputs:
       script_content:
-        description: 'Content of train.py'
+        description: 'Content of training script (Python or CUDA)'
         required: true
-        type: string  # Explicitly specify the type
+        type: string
+      script_type:
+        description: 'Script type (py or cu)'
+        required: true
+        type: choice
+        options:
+          - py
+          - cu
 
 jobs:
   train:
     runs-on: ubuntu-latest
     steps:
-      - name: Install dependencies
+      - name: Set up environment variables
+        run: |
+          echo "SCRIPT_FILE=train.${{ inputs.script_type }}" >> $GITHUB_ENV
+          echo "OUTPUT_FILE=training.log" >> $GITHUB_ENV
+
+      - name: Install Python dependencies
+        if: inputs.script_type == 'py'
         run: |
           pip install numpy
-          # pip install torch - need to find a way to cache this otherwise it will take a long time to install
+          # Add other Python dependencies as needed
+
+      - name: Install CUDA dependencies
+        if: inputs.script_type == 'cu'
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y nvidia-cuda-toolkit
+          nvcc --version
 
-      - name: Create and run training script
+      - name: Create training script
+        env:
+          SCRIPT_CONTENT: ${{ inputs.script_content }}  # untrusted input: hand it to the shell as data, never as script text
         run: |
-          echo "${{ inputs.script_content }}" > train.py
-          cat train.py  # Debug: print the content
-          python train.py > training.log 2>&1
-        
+          printf '%s\n' "$SCRIPT_CONTENT" > "$SCRIPT_FILE"
+          cat "$SCRIPT_FILE"  # Debug: print the content
+
+      - name: Compile and run CUDA script
+        if: inputs.script_type == 'cu'
+        run: |
+          nvcc ${{ env.SCRIPT_FILE }} -o train_cuda
+          ./train_cuda > ${{ env.OUTPUT_FILE }} 2>&1
+
+      - name: Run Python script
+        if: inputs.script_type == 'py'
+        run: |
+          python ${{ env.SCRIPT_FILE }} > ${{ env.OUTPUT_FILE }} 2>&1
+
       - name: Upload logs
         uses: actions/upload-artifact@v3
-        if: always()  # Upload logs whether the job succeeds or fails
+        if: always()
         with:
           name: training-logs
-          path: training.log
+          path: ${{ env.OUTPUT_FILE }}
diff --git a/discord-bot.py b/discord-bot.py
@@ -172,6 +172,56 @@ async def on_ready():
         except Exception as e:
             logger.warning(f'Failed to update nickname in guild {guild.name}: {e}')
 
+
+async def process_python(message, attachment):  # Run a train.py attachment through the GitHub Actions workflow and report the outcome in a new Discord thread
+    # Reply to the original message; the thread created below hangs off this reply
+    initial_reply = await message.reply("Found train.py! Starting training process...")
+
+    # Create a new thread from the reply (NOTE: the reply and thread creation sit outside the try block, so their failures propagate to the caller)
+    thread = await initial_reply.create_thread(
+        name=f"Training Job - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
+        auto_archive_duration=1440  # Archive after 24 hours of inactivity
+    )
+
+    try:
+        # Download the file content as bytes, then decode it as UTF-8 text
+        logger.info("Downloading train.py content")
+        script_content = await attachment.read()
+        script_content = script_content.decode('utf-8')
+        logger.info("Successfully read train.py content")
+
+        # Trigger GitHub Action; a falsy run_id is treated as a failed trigger (handled in the else branch below)
+        run_id = await trigger_github_action(script_content)
+
+        if run_id:
+            logger.info(f"Successfully triggered workflow with run ID: {run_id}")
+            await thread.send(f"GitHub Action triggered successfully! Run ID: {run_id}\nMonitoring progress...")
+
+            # Monitor the workflow; the thread is passed along, presumably so progress can be posted there (confirm in check_workflow_status)
+            status, logs, url = await check_workflow_status(run_id, thread)
+
+            # Send results back to Discord thread
+            await thread.send(f"Training completed with status: {status}")
+
+            # Split logs if they're too long for Discord's message limit; 1900-character chunks leave headroom for the code fence and the "Logs (part i/n)" header
+            if len(logs) > 1900:
+                chunks = [logs[i:i+1900] for i in range(0, len(logs), 1900)]
+                for i, chunk in enumerate(chunks):
+                    await thread.send(f"```\nLogs (part {i+1}/{len(chunks)}):\n{chunk}\n```")
+            else:
+                await thread.send(f"```\nLogs:\n{logs}\n```")
+
+            if url:  # the run URL is only posted when one was returned
+                await thread.send(f"View the full run at: {url}")
+        else:
+            logger.error("Failed to trigger GitHub Action")
+            await thread.send("Failed to trigger GitHub Action. Please check the configuration.")
+
+    except Exception as e:  # any failure inside the try block is logged with its traceback and reported in the thread instead of propagating
+        logger.error(f"Error processing request: {str(e)}", exc_info=True)
+        await thread.send(f"Error processing request: {str(e)}")
+
+
 @client.event
 async def on_message(message):
     # Ignore messages from the bot itself
@@ -185,54 +235,10 @@ async def on_message(message):
             for attachment in message.attachments:
                 logger.info(f"Processing attachment: {attachment.filename}")
                 if attachment.filename == "train.py":
-                    # Reply to the original message
-                    initial_reply = await message.reply("Found train.py! Starting training process...")
-
-                    # Create a new thread from the reply
-                    thread = await initial_reply.create_thread(
-                        name=f"Training Job - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
-                        auto_archive_duration=1440  # Archive after 24 hours of inactivity
-                    )
-
-                    try:
-                        # Download the file content
-                        logger.info("Downloading train.py content")
-                        script_content = await attachment.read()
-                        script_content = script_content.decode('utf-8')
-                        logger.info("Successfully read train.py content")
-
-                        # Trigger GitHub Action
-                        run_id = await trigger_github_action(script_content)
-
-                        if run_id:
-                            logger.info(f"Successfully triggered workflow with run ID: {run_id}")
-                            await thread.send(f"GitHub Action triggered successfully! Run ID: {run_id}\nMonitoring progress...")
-
-                            # Monitor the workflow
-                            status, logs, url = await check_workflow_status(run_id, thread)
-
-                            # Send results back to Discord thread
-                            await thread.send(f"Training completed with status: {status}")
-
-                            # Split logs if they're too long for Discord's message limit
-                            if len(logs) > 1900:
-                                chunks = [logs[i:i+1900] for i in range(0, len(logs), 1900)]
-                                for i, chunk in enumerate(chunks):
-                                    await thread.send(f"```\nLogs (part {i+1}/{len(chunks)}):\n{chunk}\n```")
-                            else:
-                                await thread.send(f"```\nLogs:\n{logs}\n```")
-
-                            if url:
-                                await thread.send(f"View the full run at: {url}")
-                        else:
-                            logger.error("Failed to trigger GitHub Action")
-                            await thread.send("Failed to trigger GitHub Action. Please check the configuration.")
-
-                    except Exception as e:
-                        logger.error(f"Error processing request: {str(e)}", exc_info=True)
-                        await thread.send(f"Error processing request: {str(e)}")
-
+                    await process_python(message, attachment)  # process_python is a coroutine function; without await it is never executed
                     break
+                elif attachment.filename == "train.cu":
+                    raise ValueError("CUDA training is not supported yet")
 
             if not any(att.filename == "train.py" for att in message.attachments):
                 await message.reply("Please attach a file named 'train.py' to your message.")