Skip to content

Commit d4523eb

Browse files
authored
Merge pull request larymak#74 from lucasgit13/main
Addition of script that download subdirectories from a github repository
2 parents 3c25885 + 8b083d4 commit d4523eb

File tree

4 files changed

+372
-0
lines changed

4 files changed

+372
-0
lines changed

Get-Dir-Github-Repo/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
__pycache__/
2+
test/

Get-Dir-Github-Repo/README.md

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Have you ever thought how do I download just a subdirectory from a Github repository? Yes? So this is the solution!
2+
3+
*Get* (I should pick a better name...) is a *"multithreaded"* Python script for a common problem I sometimes run into: getting just some files from a repo without having to clone the whole repository.
4+
5+
## Installation
6+
7+
1. Download [get.py](https://raw.githubusercontent.com/larymak/Python-project-Scripts/main/Get-Dir-Github-Repo/get.py).
8+
9+
## Requirements
10+
The script will check whether the required modules are installed; if not, it will try to install them. If that fails, you will have to install them manually. For now, get.py has only one dependency that does not come with Python by default: *Requests*. *__Make sure you have Python 3 properly installed on your system.__*
11+
12+
Download [requirements.txt](https://raw.githubusercontent.com/larymak/Python-project-Scripts/main/Get-Dir-Github-Repo/requirements.txt) and run:
13+
14+
```
15+
python3 -m pip install -r requirements.txt
16+
```
17+
18+
## Usage
19+
```cmd
20+
python3 get.py [URL] [OPTIONAL ARGS]
21+
```
22+
Let's say you want get some files from a repo: *https://github.com/user/repo*.
23+
```
24+
repo/
25+
test/
26+
build/
27+
src/
28+
file1.py
29+
file2.py
30+
file3.py
31+
file4.py
32+
file5.py
33+
file6.py
34+
file.json
35+
file.yaml
36+
README.md
37+
.gitiginore
38+
```
39+
When given a valid and public GitHub repository, the script will download the files listed in the directory the URL points to; all subdirectories will be ignored.
40+
41+
```cmd
42+
python3 get.py https://github.com/user/repo
43+
```
44+
A directory with the name of the repo will be created in the current working directory on your file system:
45+
```
46+
repo/
47+
file1.py
48+
file2.py
49+
file3.py
50+
file4.py
51+
file5.py
52+
file6.py
53+
file.json
54+
file.yaml
55+
README.md
56+
.gitiginore
57+
```
58+
### What if I want to filter the files?
59+
No problem — you can use the flags *--include-only (-I)* and *--exclude (-E)* to filter the files you do and don't want using glob search patterns.
60+
61+
```cmd
62+
python3 get.py https://github.com/user/repo -I *.py
63+
```
64+
```cmd
65+
python3 get.py https://github.com/user/repo -E *.md .*
66+
```
67+
#### For more information run:
68+
```cmd
69+
python3 get.py --help
70+
```
71+

Get-Dir-Github-Repo/get.py

Lines changed: 298 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,298 @@
1+
import argparse
2+
import concurrent.futures
3+
import fnmatch
4+
import sys
5+
import os
6+
import subprocess
7+
from itertools import product
8+
9+
__version__ = "1.1"
10+
11+
12+
# This will attempt to import the modules required for the script run
13+
# if fail to import it will try to install
14+
modules = ["requests"]
15+
16+
try:
17+
import requests
18+
except:
19+
print("Attempting to install the requirements...")
20+
21+
try:
22+
for module in modules:
23+
subprocess.run(
24+
["python", "-m", "pip", "install", module],
25+
stdout=subprocess.DEVNULL,
26+
stderr=subprocess.DEVNULL,
27+
)
28+
import requests
29+
30+
print("Requirements was successful installed!")
31+
except:
32+
try:
33+
for module in modules:
34+
subprocess.run(
35+
["python3", "-m", "pip", "install", module],
36+
stdout=subprocess.DEVNULL,
37+
stderr=subprocess.DEVNULL,
38+
)
39+
import requests
40+
41+
print("Requirements was successful installed!")
42+
except:
43+
sys.exit("Could not install requirements :(")
44+
45+
46+
### Comandline arguments ###
# Defines the CLI: one required repository URL, an optional destination
# directory, and flags for verbosity and glob-based include/exclude filters.
parser = argparse.ArgumentParser(
    description="Single Github repository directory downloader.",
    usage="%(prog)s [<optional arguments>] <url> [<destination>]",
)
parser.add_argument(
    "url",
    nargs=1,
    help="Github repository url, example: https://github.com/[<owner>]/[<repo>]",
)
parser.add_argument(
    "-V", "--version", action="version", version=f"%(prog)s {__version__}"
)
parser.add_argument(
    "-v",
    "--verbose",
    action="store_true",
    help="Print each file of the repository while clonnig",
)
parser.add_argument(
    "-I",
    "--include-only",
    dest="include",
    nargs=1,
    help="Include only the files that match the given glob pattern.",
)
parser.add_argument(
    "-E", "--exclude", nargs=1, help="Exclude files that match the given glob pattern."
)
parser.add_argument(
    "output",
    nargs="?",
    default=None,
    help="Name of the directory to clone into. (Default is branch name)",
)

# With no arguments at all, show the full help and stop. The original fell
# through to parse_args(), which then printed a second, terse "url is
# required" error after the help text.
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(0)

args = parser.parse_args()
86+
87+
88+
### Functions ###
89+
def check_url(url):
    """Validate *url* as a reachable, public GitHub repository URL.

    Exits the program with an explanatory message when the URL is not a
    GitHub URL, the network is unavailable, the request times out, or the
    repository does not exist (404). Returns None on success.
    """
    # startswith() rather than the original substring test: a URL such as
    # "https://evil.example/?u=https://github.com/x" contains the marker
    # string but is not a GitHub URL, and must be rejected.
    if not url.startswith("https://github.com/"):
        sys.exit("The url must to be a valid and public Github repository.")

    # Normalize away a single trailing slash.
    if url[-1] == "/":
        url = url[:-1]

    try:
        r = requests.get(url, timeout=30)
    except requests.ConnectionError as e:
        print(
            "OOPS!! Connection Error. Make sure you are connected to Internet. Technical Details given below.\n"
        )
        sys.exit(str(e))
    except requests.Timeout as e:
        print("OOPS!! Timeout Error")
        sys.exit(str(e))
    except requests.RequestException as e:
        print("OOPS!! General Error")
        sys.exit(str(e))
    except KeyboardInterrupt:
        sys.exit("Someone closed the program")

    # GitHub answers 404 for private or nonexistent repositories.
    if r.status_code == 404:
        sys.exit(f"404 Client Error: Not Found for url: {url}")
114+
115+
116+
def Get(url):
    """Derive the GitHub contents-API URL for *url*.

    Returns a dict with keys:
      api_url -- https://api.github.com/repos/<user>/<repo>/contents[/<path>]
      repo    -- repository name taken from the URL
      path    -- subdirectory path inside the repo ("" for the repo root)

    Exits the program when the URL cannot be parsed or when the API probe
    reports an error (rate limit 403, 404, or a network failure).
    """
    user = ""
    repo = ""
    path = ""

    # Normalize away a single trailing slash before splitting.
    if url[-1] == "/":
        url = url[:-1]

    try:
        sp = url.split("/")
        # More than 5 components means the URL points inside the repo,
        # e.g. https://github.com/<user>/<repo>/tree/<branch>/<path...>.
        # Drop the first 7 components (scheme, "", host, user, repo,
        # "tree", branch) so what remains is the subdirectory path.
        # NOTE(review): a URL with exactly 6 or 7 components makes pop(0)
        # raise IndexError here, which falls into the broad handler below
        # and exits with the generic message — confirm this is intended.
        if len(sp) > 5:
            for _ in range(7):
                sp.pop(0)
            path = "/".join(sp)

        # Owner and repo name sit at fixed positions in the split URL.
        user = url.split("/")[3]
        repo = url.split("/")[4]
        if path:
            api_url = f"https://api.github.com/repos/{user}/{repo}/contents/{path}"
        else:
            api_url = f"https://api.github.com/repos/{user}/{repo}/contents"

        if api_url:
            try:
                # Probe the API URL so rate limiting and missing paths are
                # reported before any download starts.
                r = requests.get(api_url, timeout=30)
                code = r.status_code

                if code == 403:
                    # GitHub answers 403 with a JSON "message" body when
                    # the unauthenticated request quota is exhausted.
                    if r.headers["content-type"] == "application/json; charset=utf-8":
                        if "message" in r.json():
                            sys.exit("You reached requests limit, try again later!")
                if code == 404:
                    sys.exit(f"error: {code}")
            except requests.exceptions.RequestException as e:
                sys.exit(f"error:\n{e}")
        else:
            sys.exit(f"error: could not extract information about repo: {url}.")
    except Exception as e:
        # Broad catch: any parsing failure above lands here. SystemExit is
        # a BaseException, so the sys.exit() calls above still propagate.
        print(e)
        sys.exit(f"error: could not extract information about repo: {url}.")
    else:
        return {"api_url": api_url, "repo": repo, "path": path}
158+
159+
160+
def search_pattern(obj, pattern_list):
    """Count how many (entry, pattern) pairs match in the API listing.

    obj          -- list of GitHub content dicts (each with a "name" key)
    pattern_list -- glob patterns in fnmatch syntax

    An entry whose name matches several patterns is counted once per
    matching pattern.
    """
    return sum(
        1
        for entry in obj
        for pattern in pattern_list
        if fnmatch.fnmatch(entry["name"], pattern)
    )
169+
170+
171+
def include(obj, pattern_list):
    """Filter the API listing *obj* down to files matching any pattern.

    obj          -- list of GitHub content dicts (keys "name", "type", ...)
    pattern_list -- glob patterns in fnmatch syntax

    Directories are always skipped. Returns (included_entries, match_count).
    Each entry appears at most once even when it matches several patterns
    (the original appended it once per matching pattern, which made the
    downloader fetch the same file multiple times concurrently).
    """
    include_list = []
    matches = 0

    for entry in obj:
        if entry["type"] == "dir":
            continue
        # any() stops at the first matching pattern, so a multi-pattern
        # match still contributes a single entry and a single count.
        if any(fnmatch.fnmatch(entry["name"], p) for p in pattern_list):
            include_list.append(entry)
            matches += 1

    return (include_list, matches)
185+
186+
187+
def exclude(obj, pattern_list, matches):
    """Return the entries of *obj* whose names match no exclude pattern.

    obj          -- list of GitHub content dicts (each with a "name" key)
    pattern_list -- glob patterns in fnmatch syntax
    matches      -- kept for backward compatibility with existing callers;
                    unused now that filtering is a single pass.

    The original in-place while/pop() loop had a bug: when one entry
    matched more than one pattern, the inner loop popped obj[count] again
    and removed the *following*, non-matching entry. Building a fresh list
    in one pass avoids that and the O(n*m) rescanning.
    """
    return [
        entry
        for entry in obj
        if not any(fnmatch.fnmatch(entry["name"], p) for p in pattern_list)
    ]
204+
205+
206+
def fetch(obj):
    """Download one repository file into the global target directory.

    obj -- GitHub content dict with "name" and "download_url" keys.

    Relies on the module-level ``directory`` and ``verbose`` globals set by
    the main script. Runs inside the thread pool; each call writes a
    distinct file, so no locking is needed.
    """
    file = obj["name"]
    url = obj["download_url"]

    # timeout=30 matches every other request in this script; "with"
    # guarantees the file handle is closed even if the write fails
    # (the original leaked the handle on error).
    content = requests.get(url, timeout=30).content
    filename = os.path.join(directory, file)
    with open(filename, "bw") as f:
        f.write(content)

    if verbose:
        print(file)
218+
219+
220+
### Main script ###
url = args.url[0]
check_url(url)

verbose = args.verbose
output = args.output

# Resolve the repository metadata once. The original called Get(url) three
# times, which issued three identical GitHub API requests and burned the
# unauthenticated rate limit three times faster.
repo_info = Get(url)
api_url = repo_info["api_url"]
repo = repo_info["repo"]
path = repo_info["path"]

include_list = args.include
exclude_list = args.exclude
directory = ""

if include_list and exclude_list:
    # Check if the glob pattern given to -I and -E
    # was the same; if it is, exit with an error.
    for i, e in product(include_list, exclude_list):
        if i == e:
            print(f"-I and -E cannot share same glob pattern: {i}")
            sys.exit(0)

# Destination directory: explicit output name, else the repo name,
# mirroring any subdirectory path from the URL.
directory = output if output else repo
if path:
    directory = os.path.join(directory, path)

if os.path.isdir(directory):  # Check if directory exists.
    if any(os.scandir(directory)):  # Refuse to clobber a non-empty one.
        sys.exit(f"'{directory}' already exist and is not empty.")
else:
    try:
        os.makedirs(directory)
    except OSError:
        sys.exit(f"Could not create '{directory}'.")

try:
    r = requests.get(api_url, timeout=30)
except requests.exceptions.RequestException:
    sys.exit("error: Connection error. Aborted.")

try:
    # The contents API returns a JSON array of file/dir descriptors.
    obj = r.json()
except ValueError:
    sys.exit(f"error: Could not load files on {url}")


if include_list:
    print("Searching for matches...")
    (obj_, matches) = include(obj, include_list)

    if matches != 0:
        obj = obj_
        print(f"{matches} matches found to include")
    else:
        sys.exit(f"no matches for {include_list}")

if exclude_list:
    matches = search_pattern(obj, exclude_list)
    if matches:
        # The original printed the "matches found to ignore" line only in
        # the zero-match else branch; report it when there ARE matches.
        print(f"{matches} matches found to ignore")
        obj = exclude(obj, exclude_list, matches)

print(f"\nCloning into {directory}...")

# Fan the downloads out across a thread pool; fetch() is I/O-bound.
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(fetch, obj)

print("\nDone")

Get-Dir-Github-Repo/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
requests

0 commit comments

Comments
 (0)