Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for local csv file as input #106

Merged
merged 4 commits into from
Apr 13, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 112 additions & 56 deletions criticality_score/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,27 +431,11 @@ def get_param_score(param, max_value, weight=1):
return (math.log(1 + param) / math.log(1 + max(param, max_value))) * weight


def get_repository_stats(repo, additional_params=None):
"""Return repository stats, including criticality score."""
# Validate and compute additional params first.
def get_repository_stats(repo):
"""Return repository stats by grabing the raw signal data from the repo."""
if not repo.last_commit:
logger.error(f'Repo is empty: {repo.url}')
return None
if additional_params is None:
additional_params = []
additional_params_total_weight = 0
additional_params_score = 0
for additional_param in additional_params:
try:
value, weight, max_threshold = [
int(i) for i in additional_param.split(':')
]
except ValueError:
logger.error('Parameter value in bad format: ' + additional_param)
sys.exit(1)
additional_params_total_weight += weight
additional_params_score += get_param_score(value, max_threshold,
weight)

def _worker(repo, param, return_dict):
"""worker function"""
Expand All @@ -476,6 +460,28 @@ def _worker(repo, param, return_dict):
for param in PARAMS:
result_dict[param] = return_dict[param]

return result_dict


def get_repository_score(repo_stats, additional_params=None):
"""Return one repository's criticality score based on repo stats."""
# Validate and compute additional params first.
if additional_params is None:
additional_params = []
additional_params_total_weight = 0
additional_params_score = 0
for additional_param in additional_params:
try:
value, weight, max_threshold = [
float(i) for i in additional_param.split(':')
]
except ValueError:
logger.error('Parameter value in bad format: ' + additional_param)
sys.exit(1)
additional_params_total_weight += weight
additional_params_score += get_param_score(value, max_threshold,
weight)

total_weight = (CREATED_SINCE_WEIGHT + UPDATED_SINCE_WEIGHT +
CONTRIBUTOR_COUNT_WEIGHT + ORG_COUNT_WEIGHT +
COMMIT_FREQUENCY_WEIGHT + RECENT_RELEASES_WEIGHT +
Expand All @@ -484,36 +490,72 @@ def _worker(repo, param, return_dict):
additional_params_total_weight)

criticality_score = round(
((get_param_score(result_dict['created_since'],
CREATED_SINCE_THRESHOLD, CREATED_SINCE_WEIGHT)) +
(get_param_score(result_dict['updated_since'],
UPDATED_SINCE_THRESHOLD, UPDATED_SINCE_WEIGHT)) +
(get_param_score(result_dict['contributor_count'],
CONTRIBUTOR_COUNT_THRESHOLD,
CONTRIBUTOR_COUNT_WEIGHT)) +
(get_param_score(result_dict['org_count'], ORG_COUNT_THRESHOLD,
ORG_COUNT_WEIGHT)) +
(get_param_score(result_dict['commit_frequency'],
COMMIT_FREQUENCY_THRESHOLD,
COMMIT_FREQUENCY_WEIGHT)) +
(get_param_score(result_dict['recent_releases_count'],
RECENT_RELEASES_THRESHOLD, RECENT_RELEASES_WEIGHT)) +
(get_param_score(result_dict['closed_issues_count'],
CLOSED_ISSUES_THRESHOLD, CLOSED_ISSUES_WEIGHT)) +
(get_param_score(result_dict['updated_issues_count'],
UPDATED_ISSUES_THRESHOLD, UPDATED_ISSUES_WEIGHT)) +
(get_param_score(
result_dict['comment_frequency'], COMMENT_FREQUENCY_THRESHOLD,
COMMENT_FREQUENCY_WEIGHT)) + (get_param_score(
result_dict['dependents_count'], DEPENDENTS_COUNT_THRESHOLD,
DEPENDENTS_COUNT_WEIGHT)) + additional_params_score) /
((get_param_score(float(repo_stats['created_since']),
CREATED_SINCE_THRESHOLD, CREATED_SINCE_WEIGHT)) +
(get_param_score(float(repo_stats['updated_since']),
UPDATED_SINCE_THRESHOLD, UPDATED_SINCE_WEIGHT)) +
(get_param_score(float(repo_stats['contributor_count']),
CONTRIBUTOR_COUNT_THRESHOLD, CONTRIBUTOR_COUNT_WEIGHT)) +
(get_param_score(float(repo_stats['org_count']),
ORG_COUNT_THRESHOLD, ORG_COUNT_WEIGHT)) +
(get_param_score(float(repo_stats['commit_frequency']),
COMMIT_FREQUENCY_THRESHOLD, COMMIT_FREQUENCY_WEIGHT)) +
(get_param_score(float(repo_stats['recent_releases_count']),
RECENT_RELEASES_THRESHOLD, RECENT_RELEASES_WEIGHT)) +
(get_param_score(float(repo_stats['closed_issues_count']),
CLOSED_ISSUES_THRESHOLD, CLOSED_ISSUES_WEIGHT)) +
(get_param_score(float(repo_stats['updated_issues_count']),
UPDATED_ISSUES_THRESHOLD, UPDATED_ISSUES_WEIGHT)) +
(get_param_score(float(repo_stats['comment_frequency']),
COMMENT_FREQUENCY_THRESHOLD, COMMENT_FREQUENCY_WEIGHT)) +
(get_param_score(float(repo_stats['dependents_count']),
DEPENDENTS_COUNT_THRESHOLD, DEPENDENTS_COUNT_WEIGHT)) +
additional_params_score) /
total_weight, 5)

# Make sure score between 0 (least-critical) and 1 (most-critical).
criticality_score = max(min(criticality_score, 1), 0)

result_dict['criticality_score'] = criticality_score
return result_dict
return criticality_score


def get_repository_score_from_raw_stats(repo_url, params=None):
"""Get repository's criticality_score based on raw signal data."""
repo = get_repository(repo_url)
if repo is None:
logger.error(f'Repo is not found: {repo_url}')
return
repo_stats = get_repository_stats(repo)
repo_stats["criticality_score"] = get_repository_score(repo_stats, params)

return repo_stats


def get_repository_score_from_local_csv(file_path, params=None):
"""Get repository's criticality_score based on a local csv file."""
if params is None:
additional_params = []
else:
try:
additional_params = [additional_param.split(':') for additional_param in params]
except ValueError:
logger.error('Parameter value in bad format: ' + params)
sys.exit(1)

output = []
with open(file_path, "r") as fd:
input_data = csv.DictReader(fd, delimiter=',')
for row in input_data:
logger.debug(row)

calc_params = []
for key,weight,max_threshold in additional_params:
calc_params.append(f"{row[key]}:{weight}:{max_threshold}")

row["criticality_score"] = get_repository_score(row, calc_params)
output.append(row)

return output


def get_github_token_info(token_obj):
Expand Down Expand Up @@ -581,6 +623,7 @@ def get_repository(url):
try:
repo = get_github_auth_token().get_repo(repo_url)
except github.GithubException as exp:
logger.error(f"Get github repository failed: {exp}")
if exp.status == 404:
return None
return GitHubRepository(repo)
Expand Down Expand Up @@ -667,10 +710,14 @@ def override_params(override_params):
def main():
parser = argparse.ArgumentParser(
description='Gives criticality score for an open source project')
parser.add_argument("--repo",
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--repo",
type=str,
required=True,
help="repository url")
group.add_argument("--local-file",
type=str,
dest="l_file",
help="path of a local csv file with repo stats")
parser.add_argument(
"--format",
type=str,
Expand All @@ -681,7 +728,7 @@ def main():
'--params',
nargs='+',
default=[],
help='Additional parameters in form <value>:<weight>:<max_threshold>',
help='Additional parameters in form (<value>|<key>):<weight>:<max_threshold>, <value> for repo, and <key> for loacl file',
required=False)
parser.add_argument(
'--overrides',
Expand All @@ -691,26 +738,35 @@ def main():
required=False)

initialize_logging_handlers()

args = parser.parse_args()
if args.overrides:
override_params(args.overrides)
repo = get_repository(args.repo)
if not repo:
logger.error(f'Repo is not found: {args.repo}')
return
output = get_repository_stats(repo, args.params)
if not output:

output = None
if args.repo:
calebbrown marked this conversation as resolved.
Show resolved Hide resolved
output = get_repository_score_from_raw_stats(args.repo, args.params)
elif args.l_file:
if args.format != "csv":
logger.error(f"Only support for the format of csv, now is {args.format}")
sys.exit(1)

output = get_repository_score_from_local_csv(args.l_file, args.params)
else:
raise Exception("Unknown data input type")

if output is None:
return
if args.format == 'default':
for key, value in output.items():
logger.info(f'{key}: {value}')
elif args.format == 'json':
logger.info(json.dumps(output, indent=4))
elif args.format == 'csv':
csv_writer = csv.writer(sys.stdout)
csv_writer.writerow(output.keys())
csv_writer.writerow(output.values())
if args.repo:
output = [output]
csv_writer = csv.DictWriter(sys.stdout, fieldnames=output[0].keys())
csv_writer.writeheader()
csv_writer.writerows(output)
else:
raise Exception(
'Wrong format argument, use one of default, csv or json!')
Expand Down