diff --git a/clone_repos.sh b/clone_repos.sh index a859c30..fbe1853 100755 --- a/clone_repos.sh +++ b/clone_repos.sh @@ -1,15 +1,20 @@ #!/bin/bash # Usage: bash ./clone_repos.sh file.csv CubitCodeReview -# +# +# clone_archived: "0" (clone only archived), "1" (clone only not archived) or "2" (clone all) INPUT=${1:-"file.csv"} ORG=${2:-"org"} +BACKUP_DIR=${3:-".."} +ARCHIVE_CLONE=${4:-"1"} +BACKUP_ORG_DIR=$BACKUP_DIR/$ORG OLDIFS=$IFS IFS=';' + if [ ! -d "$ORG" ]; then - mkdir -p $ORG + mkdir -p $BACKUP_ORG_DIR fi dir=`pwd` @@ -19,17 +24,23 @@ dir=`pwd` exit 99 } i=1 -while read name priv issues perms; do +while read name archived has_issues has_wiki is_private; do if [ "$i" == '1' ]; then i=0 continue # skip column's name fi + if [ "$archived" == "$ARCHIVE_CLONE" ]; then + echo "REPO $name (skip by ARCHIVE_CLONE setting). SKIPPING." + echo "__________________________________________" + continue # skip archived repo + fi LINK="git@github.com:$ORG/$name.git" - if [ ! -d "$ORG/$name" ]; then - git clone $LINK $ORG/$name + BACKUP_REPO_DIR=$BACKUP_ORG_DIR/$name + if [ ! -d "$BACKUP_REPO_DIR" ]; then + git clone $LINK $BACKUP_REPO_DIR else - echo "REPO $name (`pwd`/$ORG/$name) EXISTS. FETCHING." - cd $ORG/$name + echo "REPO $name ($BACKUP_REPO_DIR) EXISTS. FETCHING." + cd $BACKUP_REPO_DIR git fetch -a cd $dir fi diff --git a/export_org_repos.py b/export_org_repos.py index 0968847..1757602 100644 --- a/export_org_repos.py +++ b/export_org_repos.py @@ -1,81 +1,101 @@ #!/bin/python3 -# usage: python3 export_org_repos.py --token --github_nickname --orgs +# usage: python3 export_org_repos.py --token --orgs import argparse -from github import Github -import json +from github.Repository import Repository import csv -import requests +from json import dump as json_dump from time import sleep +from utils import get_github_client, get_lines_from_file + def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument('--token', type=str, required=True, - dest='token', - help='file w/github token') - parser.add_argument('--github_nickname', type=str, required=True, - dest='github_nickname', - help='organization_names_file') - parser.add_argument('--orgs', type=str, required=True, - dest='orgs', - help='organization_names_file') + parser.add_argument( + "--token", type=str, required=True, dest="token", help="file w/github token" + ) + parser.add_argument( + "--orgs", type=str, required=True, dest="orgs", help="organization_names_file" + ) + parser.add_argument( + "--verbose", action="store_true", dest="verbose", help="verbose result" + ) results = parser.parse_args() return results -def get_token(filename): - with open(filename) as file: - token = file.readline().strip() - return token +def get_writer_rows(verbose=False): + headers = "repo_name,archived,issues_count,has_wiki,is_private,last_pushed_at,size,pr_count,users_count,permissions".split( + "," + ) + return headers if verbose else headers[:5] -def get_orgs(filename): - with open(filename) as file: - orgs = (org.strip() for org in file.readlines() if org.strip()) - return orgs +def get_repo_info(repo: Repository, verbose=False): + pr_count, issues_count = 0, 0 -def get_writer_rows(): - return [ - "repo_name", - "is_private", - "issues_count", - "permissions" - ] + if repo.has_issues: + all_issues = tuple( + repo.get_issues(state="all") + ) # TODO: use totalCount after release + issues_count = sum(not issue.pull_request for issue in all_issues) + pr_count = len(all_issues) - issues_count + info = { + "repo_name": repo.name, + "is_private": int(repo.private), + "archived": int(repo.archived), + "has_wiki": int(repo.has_wiki), + "issues_count": issues_count, + } + if verbose: + users = "" + try: + users_info = repo.get_collaborators() + users_count = users_info.totalCount + for u in users_info: + users += f"{u.login}:{str(u.permissions)}," + except Exception as exc: + print(f"Error getting collaborators: {exc}") + info.update( + { + "last_pushed_at": repo.pushed_at.strftime(r"%d.%m.%y %H:%M:%S"), + "size": repo.size, + "pr_count": pr_count, + "users_count": users_count, + "permissions": users, + } + ) + return info -def get_repo_info(repo, org_name="", username=""): - users = "" - try: - for u in repo.get_collaborators(): - # params = { - # "accept": "application/vnd.github.v3+json" - # } - # sleep(1) - # res = requests.get(f"https://api.github.com/repos/{org_name}/{repo.name}/collaborators/{u.login}/permission", params=params, auth=(username, args.token)) - - users += f"{u.login}:{str(u.permissions)}," - # print(users) - except Exception as exc: - print(f"Error getting collaborators: {exc}") - return [ - str(repo.name), - str(repo.private), - str(len(list(repo.get_issues(state='all')))), - str(users) - ] +if __name__ == "__main__": + args = parse_args() + g = get_github_client(args.token) + orgs_data = {} -if __name__ == '__main__': - args = parse_args() - g = Github(get_token(args.token)) - for org_name in get_orgs(args.orgs): - print('get org [{}]'.format(org_name)) + for org_name in get_lines_from_file(args.orgs): + print(f"get org [{org_name}]") + orgs_data[org_name] = [] org = g.get_organization(org_name) - org_repos = org.get_repos() - with open(f'{org_name}.csv', 'w') as file: - writer = csv.writer(file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL) - writer.writerow(get_writer_rows()) - for repo in org_repos: + + with open(f"{org_name}.csv", "w", newline="") as file: + writer = csv.DictWriter( + file, + fieldnames=get_writer_rows(args.verbose), + delimiter=";", + quotechar="|", + quoting=csv.QUOTE_MINIMAL, + ) + writer.writeheader() + + repos = org.get_repos() + for repo in repos: print(f"Handling repo [{repo.name}]") -# sleep(10) - info = get_repo_info(repo, org_name, args.github_nickname) + info = get_repo_info(repo, args.verbose) + orgs_data[org_name].append(info) writer.writerow(info) + sleep(0.1) + + if args.verbose: + with open("orgs_info.json", "w", encoding="utf-8") as file: + json_dump(orgs_data, file, ensure_ascii=False, indent=4) diff --git a/issues_backup.py b/issues_backup.py index 0a4659e..7785313 100644 --- a/issues_backup.py +++ b/issues_backup.py @@ -2,31 +2,37 @@ # usage: python3 issues_backup.py --token --repos import os -from github import Github +from github import Issue import argparse import csv -from json import dump, load +from json import dump import os.path as path import glob from time import sleep from datetime import datetime import pytz +from utils import get_github_client -utc=pytz.UTC +utc = pytz.UTC + +DELAY = 1 # Delay to avoiding reach of Github API limit -DELAY=1 # Delay to avoiding reach of Github API limit def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument('--token', type=str, required=True, - dest='token', - help='file with github token') - parser.add_argument('--repos', type=str, required=True, - dest='repos', - help='csv file w/repos list') - parser.add_argument('--force', action='store_true', required=False, - dest='force', - help='force rewrite issues') + parser.add_argument( + "--token", type=str, required=True, dest="token", help="file with github token" + ) + parser.add_argument( + "--repos", type=str, required=True, dest="repos", help="csv file w/repos list" + ) + parser.add_argument( + "--force", + action="store_true", + required=False, + dest="force", + help="force rewrite issues", + ) results = parser.parse_args() return results @@ -38,88 +44,95 @@ def get_checked_repos(path): return res -def get_token(filename): - with open(filename) as file: - token = file.readline().strip() - return token - - def get_repos(filename): repos = [] with open(filename) as file: - reader = csv.reader(file, delimiter=';', quotechar='|') - next(reader, None) + reader = csv.DictReader(file, delimiter=";", quotechar="|") print("REPOS:") for row in reader: - repos.append((row[:1][0], int(row[2]))) + repos.append( + (row["repo_name"], bool(int(row["archived"])), int(row["issues_count"])) + ) return repos -def get_issue_info(issue): +def get_issue_info(issue: Issue.Issue): return { - 'id': issue.id, - 'title': issue.title, - 'assignees': [assignee.login for assignee in issue.assignees], - 'created_at': str(issue.created_at), - 'labels': [label.name for label in issue.get_labels()], - 'state': issue.state, - 'user': issue.user.login + "id": issue.id, + "title": issue.title, + "assignees": [assignee.login for assignee in issue.assignees], + "created_at": issue.created_at.strftime(r"%d.%m.%y %H:%M:%S"), + "labels": [label.name for label in issue.get_labels()], + "state": issue.state, + "user": issue.user.login, + "body": issue.body, + "is_pr": bool(issue.pull_request) } def get_issues_info(repo): - return repo.get_issues(state='all') + return repo.get_issues(state="all") -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() - g = Github(get_token(args.token)) - org_name = args.repos.split('.')[0] + g = get_github_client(args.token) + org_name = args.repos.split(".")[0] checked_repos = [] if not path.exists(org_name): os.mkdir(org_name) else: checked_repos = get_checked_repos(org_name) repos = get_repos(args.repos) - i=0 - for repo_item in repos: - reponame = repo_item[0] - issues_count = repo_item[1] - print(f"Processing {reponame} with {issues_count} issues {i}/{len(repos)}") - i=i+1 - if issues_count == 0: + + for i, repo_item in enumerate(repos, start=1): + reponame, is_archived, issues_count = repo_item + print( + f"Processing {i}/{len(repos)}: {reponame}, archived = {is_archived}, issues_count = {issues_count}" + ) + if not issues_count: print(f"Skipping {reponame} (zero issues)...") continue - - if (not args.force) and (reponame in checked_repos): + elif is_archived: + print(f"Skipping {reponame} (archived repo)...") + continue + elif (not args.force) and (reponame in checked_repos): print(f"Skipping {reponame} (backup exists)...") continue - sleep(DELAY) while True: try: full_reponame = f"{org_name}/{reponame}" - print('Recieving data for {}'.format(full_reponame)) + print("Recieving data for {}".format(full_reponame)) repo = g.get_repo(full_reponame) - file_name = '{}/{}.issues.json'.format(org_name, reponame.replace('/', '--')) + file_name = "{}/{}.issues.json".format( + org_name, reponame.replace("/", "--") + ) if path.exists(file_name): repo_updated_at = repo.updated_at.replace(tzinfo=utc) - file_updated = datetime.fromtimestamp(path.getmtime(file_name)).replace(tzinfo=utc) - print(f"Repo {full_reponame} updated at {repo_updated_at}, file updated at {file_updated}") + file_updated = datetime.fromtimestamp( + path.getmtime(file_name) + ).replace(tzinfo=utc) + print( + f"Repo {full_reponame} updated at {repo_updated_at}, file updated at {file_updated}" + ) if repo_updated_at < file_updated: - print(f"Repo {full_reponame} does not have new changes, skipping") + print( + f"Repo {full_reponame} does not have new changes, skipping" + ) break else: print(f"Repo {full_reponame} has new changes, need to backup") else: print(f"File for repo {full_reponame} does not exist") - sleep(DELAY) issues = get_issues_info(repo) issues_info = [] for issue in issues: + if not issue.pull_request: + # skip PR, that also in issues + issues_info.append(get_issue_info(issue)) sleep(DELAY) - issues_info.append(get_issue_info(issue)) - - with open(file_name, 'w') as file: + # TODO: check, that issues_info isn't empty + with open(file_name, "w") as file: dump(issues_info, file, ensure_ascii=False, indent=3) break @@ -127,5 +140,6 @@ def get_issues_info(repo): ## Sleep 1h print("Got exception, will wait for 1h and continue") print(e) - sleep(60*61) + sleep(60 * 61) continue + sleep(DELAY) diff --git a/main.sh b/main.sh old mode 100644 new mode 100755 index 01fdcbe..b001f94 --- a/main.sh +++ b/main.sh @@ -2,27 +2,28 @@ # Requirements # ./token - contains github token -# ./username - contains github username (token owner) # ./orgs - orgs list (separated with newline) +# BACKUP_DIR: optional. default = ".." echo "Running export_org_repos.py" -python3 ./export_org_repos.py --token ./token --github_nickname `cat ./username` --orgs ./orgs +python3 ./export_org_repos.py --token ./token --orgs ./orgs +BACKUP_DIR=${1:-".."} echo "Cloning repos" while IFS= read -r org; do echo "Processing $org" echo "Cloning repos of $org" - ./clone_repos.sh $org.csv $org + ./clone_repos.sh $org.csv $org $BACKUP_DIR echo "Cloning wikis of $org" - ../wiki_saver/csv_to_plain_list.sh $org.csv ${org}_list.txt ${org} - ../wiki_saver/save_wiki.sh ${org}_list.txt + ./wiki_saver/csv_to_wiki_list.sh $org.csv ${org}_list.txt ${org} + ./wiki_saver/save_wiki.sh ${org}_list.txt echo "Backing up issues of $org" - python3.8 ./issues_backup.py --token token --repos $org.csv --force + python3 ./issues_backup.py --token token --repos $org.csv --force done <./orgs read -n 1 -s -r -p "Press any key to continue" diff --git a/requirements.txt b/requirements.txt index c833144..b889660 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -PyGithub==1.55 +PyGithub==2.8.1 diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..bed2b22 --- /dev/null +++ b/utils.py @@ -0,0 +1,16 @@ +from github import Github, Auth + + +def get_token(filename): + with open(filename) as file: + token = file.readline().strip() + return token + + +def get_github_client(token_filepath): + return Github(auth=Auth.Token(get_token(token_filepath))) + + +def get_lines_from_file(filename): + with open(filename) as file: + return (line.strip() for line in file.readlines() if line.strip()) diff --git a/wiki_saver/csv_to_wiki_list.sh b/wiki_saver/csv_to_wiki_list.sh new file mode 100755 index 0000000..0147448 --- /dev/null +++ b/wiki_saver/csv_to_wiki_list.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# Transforms CSV list of repos (w/has_wiki flag) to plain list in format (only repos w/has_wiki=1): +# org/repo + + +src=${1} # Source file (csv) +dst=${2} # Destination file (plain text) +org=${3} # Organization + +tail -n +2 "${src}" | awk -F';' '$4 == "1" {print "'"${org}"'/"$1}' > ${dst} diff --git a/wiki_saver/save_wiki.sh b/wiki_saver/save_wiki.sh new file mode 100755 index 0000000..d18ba30 --- /dev/null +++ b/wiki_saver/save_wiki.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +repo_list=${1} + +while read line; +do + echo $line; + org=$(dirname $line) + mkdir -p ../wikis_${org} + + LINK="git@github.com:${line}.wiki.git" + BACKUP_REPO_DIR=../wikis_${line} + if [ ! -d "$BACKUP_REPO_DIR" ]; then + git clone $LINK $BACKUP_REPO_DIR + else + echo "REPO WIKI $name ($BACKUP_REPO_DIR) EXISTS. FETCHING." + cd $BACKUP_REPO_DIR + git fetch -a + cd - + fi + echo "__________________________________________" +done < ${repo_list} +