-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #8 from sangshuduo/feat/sangshuduo/random-pairs-wi…
…th-s3 feat(commit): Add random_pairs_of_s3file tool
- Loading branch information
Showing
5 changed files
with
170 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
[package] | ||
name = "random_pairs_of_s3file" | ||
version = "0.1.0" | ||
edition = "2021" | ||
|
||
[dependencies] | ||
aws-config = "1.5.13" | ||
aws-sdk-s3 = "1.68.0" | ||
tokio = { version = "1.29", features = ["macros", "rt-multi-thread"] } | ||
rand = "0.8" | ||
serde = { version = "1.0", features = ["derive"] } | ||
serde_json = "1.0" | ||
clap = { version = "4.2", features = ["derive"] } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
## random_pairs_of_s3file Usage: | ||
|
||
```shell | ||
Usage: random_pairs_of_s3file [OPTIONS] --num-pairs <NUM> --bucket <BUCKET> --directory <DIR> --url-prefix <PREFIX> | ||
|
||
Options: | ||
--num-pairs <NUM> Number of pairs to generate | ||
--bucket <BUCKET> Name of the S3 bucket | ||
--directory <DIR> Directory (prefix) in the bucket (e.g. "image/") | ||
--url-prefix <PREFIX> URL prefix for final URLs | ||
-h, --help Print help | ||
-V, --version Print version
```
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
use clap::Parser; | ||
use rand::seq::SliceRandom; | ||
use serde::Serialize; | ||
use std::error::Error; | ||
|
||
// AWS SDK for Rust (1.x) | ||
use aws_config::{load_defaults, BehaviorVersion}; | ||
use aws_sdk_s3::error::SdkError; | ||
use aws_sdk_s3::types::Object; | ||
use aws_sdk_s3::Client; | ||
|
||
/// Command-line arguments (all required, no defaults) | ||
#[derive(Parser, Debug)] | ||
#[command(author, version, about)] | ||
struct Args { | ||
/// Number of pairs to generate | ||
#[arg(long, required = true)] | ||
num_pairs: usize, | ||
|
||
/// Name of the S3 bucket | ||
#[arg(long, required = true)] | ||
bucket: String, | ||
|
||
/// Directory (prefix) in the bucket (e.g. "image/") | ||
#[arg(long, required = true)] | ||
directory: String, | ||
|
||
/// URL prefix to form the final URL (e.g. "https://api.example.com/s3/api/v1/resource?url=s3://") | ||
#[arg(long, required = true)] | ||
url_prefix: String, | ||
} | ||
|
||
#[derive(Serialize)] | ||
struct PairsOutput { | ||
pairs: Vec<Pair>, | ||
} | ||
|
||
#[derive(Serialize)] | ||
struct Pair { | ||
source: String, | ||
candidate: String, | ||
} | ||
|
||
#[tokio::main] | ||
async fn main() -> Result<(), Box<dyn Error>> { | ||
let args = Args::parse(); | ||
|
||
let num_pairs = args.num_pairs; | ||
let bucket_name = &args.bucket; | ||
let directory_prefix = &args.directory; | ||
let url_prefix = &args.url_prefix; | ||
|
||
let shared_config = load_defaults(BehaviorVersion::latest()).await; | ||
let s3_client = Client::new(&shared_config); | ||
|
||
let resp = s3_client | ||
.list_objects_v2() | ||
.bucket(bucket_name) | ||
.prefix(directory_prefix) | ||
.send() | ||
.await; | ||
|
||
let output = match resp { | ||
Ok(o) => o, | ||
Err(SdkError::ServiceError(e)) => { | ||
eprintln!("Service error: {:#?}", e); | ||
return Ok(()); | ||
} | ||
Err(e) => { | ||
eprintln!("Other error listing objects: {:?}", e); | ||
return Ok(()); | ||
} | ||
}; | ||
|
||
// Extract all object keys | ||
let objects: &[Object] = output.contents(); | ||
let all_keys: Vec<String> = objects | ||
.iter() | ||
.filter_map(|obj| obj.key().map(str::to_string)) | ||
.collect(); | ||
|
||
if all_keys.len() < 2 { | ||
eprintln!( | ||
"Not enough objects to generate pairs. Found only {} object(s).", | ||
all_keys.len() | ||
); | ||
return Ok(()); | ||
} | ||
|
||
// Generate all unique pairs (source, candidate) where source != candidate | ||
let mut all_pairs = Vec::new(); | ||
for (i, source) in all_keys.iter().enumerate() { | ||
// check if source is empty | ||
if source.is_empty() { | ||
continue; | ||
} | ||
for (j, candidate) in all_keys.iter().enumerate() { | ||
// check if candidate is is_empty | ||
if candidate.is_empty() { | ||
continue; | ||
} | ||
if i != j { | ||
all_pairs.push(Pair { | ||
source: format!("{}{}/{}", url_prefix, bucket_name, source), | ||
candidate: format!("{}{}/{}", url_prefix, bucket_name, candidate), | ||
}); | ||
} | ||
} | ||
} | ||
|
||
let max_pairs_possible = all_pairs.len(); | ||
if num_pairs > max_pairs_possible { | ||
eprintln!( | ||
"Requested {} pairs, but only {} unique pairs can be generated with {} objects.", | ||
num_pairs, | ||
max_pairs_possible, | ||
all_keys.len() | ||
); | ||
} | ||
|
||
// Shuffle and take the requested number of pairs | ||
let mut rng = rand::thread_rng(); | ||
all_pairs.shuffle(&mut rng); | ||
|
||
let selected_pairs: Vec<Pair> = all_pairs.into_iter().take(num_pairs).collect(); | ||
|
||
if selected_pairs.len() < num_pairs { | ||
eprintln!( | ||
"Requested {} pairs, but only {} unique pairs could be generated with {} objects.", | ||
num_pairs, | ||
selected_pairs.len(), | ||
all_keys.len() | ||
); | ||
} | ||
|
||
// Print JSON output | ||
let output_json = PairsOutput { | ||
pairs: selected_pairs, | ||
}; | ||
println!("{}", serde_json::to_string_pretty(&output_json)?); | ||
|
||
Ok(()) | ||
} |