Skip to content

Commit

Permalink
Feat: layout for cuda kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
S1ro1 committed Nov 9, 2024
1 parent 668e4ea commit e474abe
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 58 deletions.
52 changes: 41 additions & 11 deletions .github/workflows/train_workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,58 @@ on:
workflow_dispatch:
inputs:
script_content:
description: 'Content of train.py'
description: 'Content of training script (Python or CUDA)'
required: true
type: string # Explicitly specify the type
type: string
script_type:
description: 'Script type (py or cu)'
required: true
type: choice
options:
- py
- cu

jobs:
train:
runs-on: ubuntu-latest
steps:
- name: Install dependencies
- name: Set up environment variables
run: |
echo "SCRIPT_FILE=train.${{ inputs.script_type }}" >> $GITHUB_ENV
echo "OUTPUT_FILE=training.log" >> $GITHUB_ENV
- name: Install Python dependencies
if: inputs.script_type == 'py'
run: |
pip install numpy
# pip install torch - need to find a way to cache this otherwise it will take a long time to install
# Add other Python dependencies as needed
- name: Install CUDA dependencies
if: inputs.script_type == 'cu'
run: |
sudo apt-get update
sudo apt-get install -y nvidia-cuda-toolkit
nvcc --version
- name: Create and run training script
- name: Create training script
run: |
echo "${{ inputs.script_content }}" > train.py
cat train.py # Debug: print the content
python train.py > training.log 2>&1
echo "${{ inputs.script_content }}" > ${{ env.SCRIPT_FILE }}
cat ${{ env.SCRIPT_FILE }} # Debug: print the content
- name: Compile and run CUDA script
if: inputs.script_type == 'cu'
run: |
nvcc ${{ env.SCRIPT_FILE }} -o train_cuda
./train_cuda > ${{ env.OUTPUT_FILE }} 2>&1
- name: Run Python script
if: inputs.script_type == 'py'
run: |
python ${{ env.SCRIPT_FILE }} > ${{ env.OUTPUT_FILE }} 2>&1
- name: Upload logs
uses: actions/upload-artifact@v3
if: always() # Upload logs whether the job succeeds or fails
if: always()
with:
name: training-logs
path: training.log
path: ${{ env.OUTPUT_FILE }}
100 changes: 53 additions & 47 deletions discord-bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,56 @@ async def on_ready():
except Exception as e:
logger.warning(f'Failed to update nickname in guild {guild.name}: {e}')


async def process_python(message, attachment):
    """Run an attached ``train.py`` through the GitHub Actions workflow.

    Replies to ``message``, opens a thread on that reply, then reads the
    attachment, triggers the workflow, waits on it through
    ``check_workflow_status`` and posts the status, the logs and the run URL
    into the thread. Any exception raised once the thread exists is logged
    and reported in the thread rather than propagated.

    This is a coroutine function: it must be awaited (or scheduled as a
    task); merely calling it creates a coroutine object and runs nothing.

    Args:
        message: The Discord message that carried the attachment.
        attachment: The attachment whose bytes are the training script.
    """
    reply = await message.reply("Found train.py! Starting training process...")

    # The job thread hangs off the bot's own reply.
    started_at = datetime.now().strftime('%Y-%m-%d %H:%M')
    thread = await reply.create_thread(
        name=f"Training Job - {started_at}",
        auto_archive_duration=1440,  # 24 hours of inactivity
    )

    try:
        logger.info("Downloading train.py content")
        raw_bytes = await attachment.read()
        script_content = raw_bytes.decode('utf-8')
        logger.info("Successfully read train.py content")

        run_id = await trigger_github_action(script_content)

        # Guard clause: nothing to monitor when the trigger failed.
        if not run_id:
            logger.error("Failed to trigger GitHub Action")
            await thread.send("Failed to trigger GitHub Action. Please check the configuration.")
            return

        logger.info(f"Successfully triggered workflow with run ID: {run_id}")
        await thread.send(f"GitHub Action triggered successfully! Run ID: {run_id}\nMonitoring progress...")

        status, logs, url = await check_workflow_status(run_id, thread)
        await thread.send(f"Training completed with status: {status}")

        # Keep each message under Discord's message limit by slicing the
        # logs into fixed-size pieces when they are too long for one post.
        piece_size = 1900
        if len(logs) <= piece_size:
            await thread.send(f"```\nLogs:\n{logs}\n```")
        else:
            pieces = [
                logs[start:start + piece_size]
                for start in range(0, len(logs), piece_size)
            ]
            total = len(pieces)
            for part_no, piece in enumerate(pieces, start=1):
                await thread.send(f"```\nLogs (part {part_no}/{total}):\n{piece}\n```")

        if url:
            await thread.send(f"View the full run at: {url}")

    except Exception as e:
        detail = str(e)
        logger.error(f"Error processing request: {detail}", exc_info=True)
        await thread.send(f"Error processing request: {detail}")


@client.event
async def on_message(message):
# Ignore messages from the bot itself
Expand All @@ -185,54 +235,10 @@ async def on_message(message):
for attachment in message.attachments:
logger.info(f"Processing attachment: {attachment.filename}")
if attachment.filename == "train.py":
# Reply to the original message
initial_reply = await message.reply("Found train.py! Starting training process...")

# Create a new thread from the reply
thread = await initial_reply.create_thread(
name=f"Training Job - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
auto_archive_duration=1440 # Archive after 24 hours of inactivity
)

try:
# Download the file content
logger.info("Downloading train.py content")
script_content = await attachment.read()
script_content = script_content.decode('utf-8')
logger.info("Successfully read train.py content")

# Trigger GitHub Action
run_id = await trigger_github_action(script_content)

if run_id:
logger.info(f"Successfully triggered workflow with run ID: {run_id}")
await thread.send(f"GitHub Action triggered successfully! Run ID: {run_id}\nMonitoring progress...")

# Monitor the workflow
status, logs, url = await check_workflow_status(run_id, thread)

# Send results back to Discord thread
await thread.send(f"Training completed with status: {status}")

# Split logs if they're too long for Discord's message limit
if len(logs) > 1900:
chunks = [logs[i:i+1900] for i in range(0, len(logs), 1900)]
for i, chunk in enumerate(chunks):
await thread.send(f"```\nLogs (part {i+1}/{len(chunks)}):\n{chunk}\n```")
else:
await thread.send(f"```\nLogs:\n{logs}\n```")

if url:
await thread.send(f"View the full run at: {url}")
else:
logger.error("Failed to trigger GitHub Action")
await thread.send("Failed to trigger GitHub Action. Please check the configuration.")

except Exception as e:
logger.error(f"Error processing request: {str(e)}", exc_info=True)
await thread.send(f"Error processing request: {str(e)}")

process_python(message, attachment)
break
elif attachment.filename == "train.cu":
raise ValueError("CUDA training is not supported yet")

if not any(att.filename == "train.py" for att in message.attachments):
await message.reply("Please attach a file named 'train.py' to your message.")
Expand Down

0 comments on commit e474abe

Please sign in to comment.