Skip to content

Commit

Permalink
Merge pull request #8 from sangshuduo/feat/sangshuduo/random-pairs-wi…
Browse files Browse the repository at this point in the history
…th-s3

feat(commit): Add random_pairs_of_s3file tool
  • Loading branch information
sangshuduo authored Jan 13, 2025
2 parents e9b2377 + d1c2765 commit 609e57b
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ Cargo.lock
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.mentat
.ai-commit.json
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ members = [
"hit_rate_converter",
"cat_xlsx",
"find_files_in_list",
"random_pairs_of_s3file"
# Add other tools here
]
resolver = "2" # Add this line to specify resolver version 2
13 changes: 13 additions & 0 deletions random_pairs_of_s3file/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[package]
name = "random_pairs_of_s3file"
version = "0.1.0"
edition = "2021"

[dependencies]
aws-config = "1.5.13"
aws-sdk-s3 = "1.68.0"
tokio = { version = "1.29", features = ["macros", "rt-multi-thread"] }
rand = "0.8"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
clap = { version = "4.2", features = ["derive"] }
12 changes: 12 additions & 0 deletions random_pairs_of_s3file/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
## random_pairs_of_s3file Usage:

```shell
Usage: random_pairs_of_s3file [OPTIONS] --num-pairs <NUM> --bucket <BUCKET> --directory <DIR> --url-prefix <PREFIX>

Options:
--num-pairs <NUM> Number of pairs to generate
--bucket <BUCKET> Name of the S3 bucket
--directory <DIR> Directory (prefix) in the bucket (e.g. "image/")
--url-prefix <PREFIX> URL prefix for final URLs
-h, --help Print help
-V, --version Print version
```
143 changes: 143 additions & 0 deletions random_pairs_of_s3file/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
use clap::Parser;
use rand::seq::SliceRandom;
use serde::Serialize;
use std::error::Error;

// AWS SDK for Rust (1.x)
use aws_config::{load_defaults, BehaviorVersion};
use aws_sdk_s3::error::SdkError;
use aws_sdk_s3::types::Object;
use aws_sdk_s3::Client;

// Parsed once at the top of `main` via `Args::parse()`. Each field carries
// `#[arg(long)]`, i.e. it is supplied as a long flag (`--num-pairs`,
// `--bucket`, `--directory`, `--url-prefix`).
// NOTE(review): the extra notes in this struct are plain `//` comments on the
// assumption that clap turns `///` doc comments into --help text — confirm
// before converting any of them to `///`.
/// Command-line arguments (all required, no defaults)
#[derive(Parser, Debug)]
#[command(author, version, about)]
struct Args {
// Upper bound on how many pairs `main` emits; `main` warns on stderr when
// fewer pairs than this can be produced.
/// Number of pairs to generate
#[arg(long, required = true)]
num_pairs: usize,

// Used both as the bucket for the `list_objects_v2` request and as a path
// component of every emitted URL.
/// Name of the S3 bucket
#[arg(long, required = true)]
bucket: String,

// Sent as the `prefix` of the `list_objects_v2` request.
/// Directory (prefix) in the bucket (e.g. "image/")
#[arg(long, required = true)]
directory: String,

// Concatenated directly in front of `<bucket>/<key>`; `main` inserts no
// separator between this prefix and the bucket name.
/// URL prefix to form the final URL (e.g. "https://api.example.com/s3/api/v1/resource?url=s3://")
#[arg(long, required = true)]
url_prefix: String,
}

/// Top-level JSON document printed to stdout by `main`.
///
/// `main` serializes it with `serde_json::to_string_pretty`, so the output is
/// an object with a single `pairs` array.
#[derive(Serialize)]
struct PairsOutput {
/// The selected (source, candidate) pairs, in the random order `main` chose.
pairs: Vec<Pair>,
}

/// One ordered pair of object URLs; `main` never pairs an object with itself.
///
/// Both fields are built by `main` as `<url_prefix><bucket>/<key>`.
#[derive(Serialize)]
struct Pair {
/// URL of the first object of the pair.
source: String,
/// URL of the second object of the pair (a different key than `source`).
candidate: String,
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
let args = Args::parse();

let num_pairs = args.num_pairs;
let bucket_name = &args.bucket;
let directory_prefix = &args.directory;
let url_prefix = &args.url_prefix;

let shared_config = load_defaults(BehaviorVersion::latest()).await;
let s3_client = Client::new(&shared_config);

let resp = s3_client
.list_objects_v2()
.bucket(bucket_name)
.prefix(directory_prefix)
.send()
.await;

let output = match resp {
Ok(o) => o,
Err(SdkError::ServiceError(e)) => {
eprintln!("Service error: {:#?}", e);
return Ok(());
}
Err(e) => {
eprintln!("Other error listing objects: {:?}", e);
return Ok(());
}
};

// Extract all object keys
let objects: &[Object] = output.contents();
let all_keys: Vec<String> = objects
.iter()
.filter_map(|obj| obj.key().map(str::to_string))
.collect();

if all_keys.len() < 2 {
eprintln!(
"Not enough objects to generate pairs. Found only {} object(s).",
all_keys.len()
);
return Ok(());
}

// Generate all unique pairs (source, candidate) where source != candidate
let mut all_pairs = Vec::new();
for (i, source) in all_keys.iter().enumerate() {
// check if source is empty
if source.is_empty() {
continue;
}
for (j, candidate) in all_keys.iter().enumerate() {
// check if candidate is is_empty
if candidate.is_empty() {
continue;
}
if i != j {
all_pairs.push(Pair {
source: format!("{}{}/{}", url_prefix, bucket_name, source),
candidate: format!("{}{}/{}", url_prefix, bucket_name, candidate),
});
}
}
}

let max_pairs_possible = all_pairs.len();
if num_pairs > max_pairs_possible {
eprintln!(
"Requested {} pairs, but only {} unique pairs can be generated with {} objects.",
num_pairs,
max_pairs_possible,
all_keys.len()
);
}

// Shuffle and take the requested number of pairs
let mut rng = rand::thread_rng();
all_pairs.shuffle(&mut rng);

let selected_pairs: Vec<Pair> = all_pairs.into_iter().take(num_pairs).collect();

if selected_pairs.len() < num_pairs {
eprintln!(
"Requested {} pairs, but only {} unique pairs could be generated with {} objects.",
num_pairs,
selected_pairs.len(),
all_keys.len()
);
}

// Print JSON output
let output_json = PairsOutput {
pairs: selected_pairs,
};
println!("{}", serde_json::to_string_pretty(&output_json)?);

Ok(())
}

0 comments on commit 609e57b

Please sign in to comment.