diff --git a/LICENSE b/LICENSE index 8dada3e..dbf8f6c 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright {yyyy} {name of copyright owner} + Copyright 2019-2021 Habib Rehman (https://git.io/HR) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/Pipfile b/Pipfile deleted file mode 100644 index 6257ab1..0000000 --- a/Pipfile +++ /dev/null @@ -1,9 +0,0 @@ -[[source]] -url = "https://pypi.python.org/simple" -verify_ssl = true -name = "pypi" - -[dev-packages] - -[packages] -requests = "*" \ No newline at end of file diff --git a/Pipfile.lock b/Pipfile.lock deleted file mode 100644 index 44dc379..0000000 --- a/Pipfile.lock +++ /dev/null @@ -1,67 +0,0 @@ -{ - "_meta": { - "hash": { - "sha256": "a0e63f8a0d1e3df046dc19b3ffbaaedfa151afc12af5a5b960ae7393952f8679" - }, - "host-environment-markers": { - "implementation_name": "cpython", - "implementation_version": "0", - "os_name": "posix", - "platform_machine": "x86_64", - "platform_python_implementation": "CPython", - "platform_release": "17.0.0", - "platform_system": "Darwin", - "platform_version": "Darwin Kernel Version 17.0.0: Thu Aug 24 21:48:19 PDT 2017; root:xnu-4570.1.46~2/RELEASE_X86_64", - "python_full_version": "2.7.14", - "python_version": "2.7", - "sys_platform": "darwin" - }, - "pipfile-spec": 6, - "requires": {}, - "sources": [ - { - "name": "pypi", - "url": "https://pypi.python.org/simple", - "verify_ssl": true - } - ] - }, - "default": { - "certifi": { - "hashes": [ - "sha256:54a07c09c586b0e4c619f02a5e94e36619da8e2b053e20f594348c0611803704", - "sha256:40523d2efb60523e113b44602298f0960e900388cf3bb6043f645cf57ea9e3f5" - ], - "version": "==2017.7.27.1" - }, - "chardet": { - "hashes": [ - "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691", - "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae" - ], - "version": "==3.0.4" - }, - "idna": { - "hashes": [ - "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4", - "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f" - ], - "version": "==2.6" - }, - "requests": { - "hashes": [ - "sha256:6a1b267aa90cac58ac3a765d067950e7dbbf75b1da07e895d1f594193a40a38b", - "sha256:9c443e7324ba5b85070c4a818ade28bfabedf16ea10206da1132edaa6dda237e" - ], - "version": "==2.18.4" - }, - "urllib3": { - "hashes": [ - "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b", - "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f" - ], - "version": "==1.22" - } - }, - "develop": {} -} diff --git a/README.md b/README.md index 633e751..8206461 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,69 @@ -# github-clone -Recursively clone a GitHub repo sub-dir +# GitHub clone + +[![PyPi Downloads](https://img.shields.io/pypi/dm/github-clone.svg?style=for-the-badge&logo=pypi)](https://pypi.org/project/github-clone/) + +Git clone (download) any sub-directories of any GitHub repository (at any reference) without having to clone the entire repository, with only its GitHub URL. +Uses the GitHub API to recursively clone the sub-directories tree and files. + +## Motivation + +I often find myself wanting to only download a certain directory, path or package of an especially big repo that I'm currently viewing (without even cloning the entire repo at depth 1) and to do so by simply copy & pasting the GitHub URL so that's why. Probably more instances where this might come in handy ;) + +## Rate limit +The GitHub API imposes a [rate limiting](https://developer.github.com/v3/#rate-limiting) of up to 60 requests per hour applies but can be increased to up to 5000 requests per hour using an _OAuth token_ (to get one see https://help.github.com/en/articles/creating-a-personal-access-token-for-the-command-line). + +GitHub clone makes an initial request to fetch repo metadata and then, a request for every subfolder in the repo. The requests to download the files within the folders are not counted against the rate limit so in most cases (i.e. the folder/repo you're trying to clone has less than 60 subfolders) the rate limit should not be a problem. + +## Private repositories +To clone private repositories you need to supply an _OAuth token_ for an account with access to the private repository (to get one see https://help.github.com/en/articles/creating-a-personal-access-token-for-the-command-line). + +# Installation + +Available on PyPi https://pypi.org/project/github-clone/. + +Install the script via `pip`: +``` +pip install github-clone +``` +or via `pipsi`: +``` +pipsi install github-clone +``` +Uses Python 3.3+ + +# Usage +``` +GitHub clone (git.io/ghclone) + +Usage: + ghclone [-t | --token=] + ghclone (-h | --help) + ghclone (-v | --version) + +Examples: + ghclone https://github.com/HR/Crypter/tree/master/app + ghclone https://github.com/HR/Crypter/tree/dev/app + ghclone https://github.com/HR/Crypter/tree/v3.1.0/build + ghclone https://github.com/HR/Crypter/tree/cbee54dd720bb8aaa3a2111fcec667ca5f700510/build + ghclone https://github.com/HR/Picturesque/tree/master/app/src -t li50d67757gm20556d53f08126215725a698560b + +Options: + -h --help Show this screen. + -v --version Show version. + -t --token= Set a GitHub OAuth token (see https://developer.github.com/v3/#rate-limiting). +``` +# License +Copyright (C) 2019-2021 Habib Rehman (https://git.io/HR) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + diff --git a/clone.py b/clone.py deleted file mode 100644 index b86332f..0000000 --- a/clone.py +++ /dev/null @@ -1,94 +0,0 @@ -import requests -import re -import sys -import os - - -recursive = True -base_url = 'https://api.github.com' -# /repos/:owner/:repo/git/trees/:sha?recursive=:bool -tree_endpoint = base_url + '/repos/{}/{}/git/trees/{}?recursive={}' -contents_endpoint = base_url + '/repos/{}/{}/contents' -commits_endpoint = base_url + '/repos/{}/{}/commits' -base_normalize_regex = re.compile(r'.*github\.com\/') - - -def exit_with_m(m='An error occured'): - print m - sys.exit() - - -def joinp(*args): - '/'.join(args) - - -def mkdir_p(path): - try: - os.makedirs(path) - except OSError as exc: # Python >2.5 - if exc.errno == errno.EEXIST and os.path.isdir(path): - pass - else: - raise - - -def fetch_file(req_url, file_path): - r = requests.get(req_url, stream=True) - try: - r.raise_for_status() - except Exception as e: - exit_with_m('Failed fetching ' + req_url, e) - - with open(file_path, 'wb') as fd: - for chunk in req.iter_content(chunk_size=128): - fd.write(chunk) - - -def fetch(base_url, path=None): - """ - Recursively fetch the repo metadata - """ - req_url = base_url if not path else joinp(base_url, path) - # Request - r = requests.get(req_url) - - try: - r.raise_for_status() - except Exception as e: - exit_with_m('Failed fetching repo metdata: ', e) - - repo_data = r.json() - - if isinstance(repo_data, list): - # Recursively fetch content - for item in repo_data: - if item['type'] == 'dir': - # create dir and then fetch recursively - print 'Walking dir: %s' % item['path'] - path = joinp(path, item['path']) - fetch(joinp(base_url, path)) - else: - # download it - # Ensure dir directory exists locally - mkdir_p(path) - print 'Fetching file: %s' % item['path'] - - -if len(sys.argv) > 1: - gh_url = sys.argv[1] -else: - exit_with_m('Nothing to clone :(') - -# Normalize & parse input -norm_gh_url = re.sub(base_normalize_regex, '', gh_url) -gh_url_comps = norm_gh_url.split('/') -user, repo = gh_url_comps[:2] -branch = gh_url_comps[3] -path = joinp(gh_url_comps[4:]) - - -api_req_url = contents_endpoint.format(user, repo) - -print "Fetching sub repo %s..." % (api_req_url) - -fetch(api_req_url, path) diff --git a/ghclone/__init__.py b/ghclone/__init__.py new file mode 100644 index 0000000..fb4b168 --- /dev/null +++ b/ghclone/__init__.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 +""" +GitHub clone (git.io/ghclone) + +Usage: + ghclone [-t | --token=] + ghclone (-h | --help) + ghclone (-v | --version) + +Examples: + ghclone https://github.com/HR/Crypter/tree/master/app + ghclone https://github.com/HR/Crypter/tree/dev/app + ghclone https://github.com/HR/Crypter/tree/v3.1.0/build + ghclone https://github.com/HR/Crypter/tree/cbee54dd720bb8aaa3a2111fcec667ca5f700510/build + ghclone https://github.com/HR/Picturesque/tree/master/app/src -t li50d67757gm20556d53f08126215725a698560b + +Options: + -h --help Show this screen. + -v --version Show version. + -t --token= Set a GitHub OAuth token (see https://developer.github.com/v3/#rate-limiting). + +(C) 2019-2021 Habib Rehman (git.io/HR) +""" +import requests +import re +import os +import errno +from sys import exit +from docopt import docopt + +__version__ = '1.2.0' +GH_API_BASE_URL = 'https://api.github.com' +GH_REPO_CONTENTS_ENDPOINT = GH_API_BASE_URL + '/repos/{}/{}/contents' +BASE_NORMALIZE_REGEX = re.compile(r'.*github\.com\/') + +req = requests.Session() +req.headers.update({'User-Agent': 'git.io/ghclone ' + __version__}) + + +def exit_with_m(m='An error occured'): + print(m) + exit(1) + + +def mkdir_p(path): + try: + os.makedirs(path) + except OSError as err: # Python >2.5 + if err.errno == errno.EEXIST and os.path.isdir(path): + pass + else: + raise + + +def clone_file(download_url, file_path): + """ + Clones the file at the download_url to the file_path + """ + r = req.get(download_url, stream=True) + try: + r.raise_for_status() + except Exception as e: + exit_with_m('Failed to clone ' + download_url) + + with open(file_path, 'wb') as fd: + for chunk in r.iter_content(chunk_size=128): + fd.write(chunk) + + +def clone(base_url, rel_url=None, path=None, ref=None): + """ + Recursively clones the path + """ + req_url = base_url + '/' + rel_url if rel_url else base_url + + # Get path metadata + r = req.get(req_url) if not ref else req.get(req_url, params={'ref': ref}) + try: + r.raise_for_status() + except Exception as e: + exit_with_m('Failed to fetch metadata for ' + path) + repo_data = r.json() + + # Recursively clone content + for item in repo_data: + if item['type'] == 'dir': + # Fetch dir recursively + clone(base_url, item['path'], path, ref) + else: + # Fetch the file + new_file_path = resolve_path(item['path'], path) + new_path = os.path.dirname(new_file_path) + # Create path locally + mkdir_p(new_path) + # Download the file + clone_file(item['download_url'], new_file_path) + # print('Cloned', item['path']) + + +def resolve_path(path, dir): + index = path.find(dir) + if index is -1: + return os.path.abspath(os.path.join(dir, path)) + else: + return os.path.abspath(path[index:]) + + +### +# Main +### +def main(): + arguments = docopt(__doc__) + if arguments['--version']: + print(__version__) + exit(0) + + # Get params + gh_url = arguments[''] + token = arguments['--token'] + if token: + req.headers.update({'Authorization': 'token ' + token[0]}) + # Normalize & parse input + normal_gh_url = re.sub(BASE_NORMALIZE_REGEX, '', gh_url) + gh_args = normal_gh_url.replace('/tree', '').split('/') + + if len(gh_args) < 2 or normal_gh_url == gh_url: + exit_with_m('Invalid GitHub URI') + + user, repo = gh_args[:2] + ref = None + rel_url = None + + if len(gh_args) >= 2: + # Clone entire repo + path = repo + + if len(gh_args) >= 3: + # Clone entire repo at the branch + ref = gh_args[2] + + if len(gh_args) >= 4: + # Clone subdirectory + rel_url = os.path.join(*gh_args[3:]) + path = gh_args[-1] + + api_req_url = GH_REPO_CONTENTS_ENDPOINT.format(user, repo) + + print("Cloning into '%s'..." % path) + clone(api_req_url, rel_url, path, ref) + print("done.") diff --git a/ghclone/__main__.py b/ghclone/__main__.py new file mode 100644 index 0000000..1ddec88 --- /dev/null +++ b/ghclone/__main__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +from . import main + +if __name__ == '__main__': + main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b5a3c46 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,6 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..2a5131e --- /dev/null +++ b/setup.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import re +from setuptools import find_packages, setup + + +version_regex = r'__version__ = ["\']([^"\']*)["\']' +with open('ghclone/__init__.py',) as f: + text = f.read() + match = re.search(version_regex, text) + + if match: + version = match.group(1) + else: + raise RuntimeError('No version number found!') + + +with open('README.md', 'r', encoding='utf-8') as fh: + long_description = fh.read() + + +setup( + name='github-clone', + packages=find_packages(), + version=version, + description='Clone any subdirectory of a GitHub repo with just the GitHub URL', + long_description=long_description, + long_description_content_type='text/markdown', + url='https://github.com/HR/github-clone', + author='Habib Rehman', + author_email='Hi@HabibRehman.com', + license='Apache 2.0', + project_urls={ + "Bug Tracker": "https://github.com/HR/github-clone/issues", + }, + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Environment :: Console', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: Python :: 3', + 'Operating System :: OS Independent' + ], + python_requires='>=3.6', + install_requires=[ + 'requests>=2.20.0', + 'docopt>=0.6.2', + ], + entry_points={ + 'console_scripts': [ + 'ghclone=ghclone:main', + ], + }, +) \ No newline at end of file