-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.py
99 lines (73 loc) · 2.45 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
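## Data-gathering script: finds Python repositories through the GitHub API (an API key is required), clones them, and builds a text dataset from their .py files.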
import itertools
import os
import subprocess
import time

from github import Github
from tqdm import tqdm
# Authenticate against the GitHub API and print the authenticated user.
# NOTE(review): `_ENV` is not defined anywhere in the visible file; presumably
# it stands for the API token and must be supplied before running -- confirm.
g = Github(_ENV)
print(g.get_user())

# Search for Python repositories matching "langchain".
query = "langchain language:python"
result = g.search_repositories(query)

# Upper bound on the number of repositories to clone.
no_of_repos = 1000

# Iterate the search results lazily and stop at whichever comes first: the
# limit above or the end of the results. Indexing `result[i]` over a fixed
# range raises IndexError when the search returns fewer repositories.
for repo in itertools.islice(result, no_of_repos):
    print(repo.clone_url)
    print(repo.tags_url)
    # Pass an argument list (no shell) so values coming from the API are never
    # interpreted by a shell. `check=False` keeps the original best-effort
    # behaviour: a failed clone does not stop the loop.
    subprocess.run(
        [
            "git",
            "clone",
            "--",
            repo.clone_url,
            f"repos/{repo.owner.login}/{repo.name}",
        ],
        check=False,
    )
import os
# Root directory that holds the cloned repositories.
d = "repos"

# Strip every file that is not a ".py" file from the cloned tree.
for root, _subdirs, names in tqdm(os.walk(d)):
    for name in names:
        candidate = os.path.join(root, name)

        # Python sources are kept untouched.
        if candidate.endswith(".py"):
            continue

        # Sanity check: only delete paths that contain the root directory name.
        if d not in candidate:
            # The message is Hindi, roughly "something is wrong".
            print("kuch to gadbad hai")
            # NOTE(review): the original indentation was lost; the pause is
            # assumed to belong to this branch -- confirm.
            time.sleep(60)
            continue

        os.remove(candidate)
def get_dir_size(start_path='.'):
    """Return the combined size, in bytes, of the files under *start_path*.

    The tree is walked recursively with ``os.walk``. Symbolic links are not
    counted. A file whose size cannot be read (for example because it was
    removed while the walk was in progress) is skipped instead of aborting
    the whole tally.

    Args:
        start_path: Directory to measure. Defaults to the current directory.

    Returns:
        Total size in bytes; 0 when nothing is found under *start_path*.
    """
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(start_path):
        for name in filenames:
            fp = os.path.join(dirpath, name)
            # skip if it is symbolic link
            if os.path.islink(fp):
                continue
            try:
                total_size += os.path.getsize(fp)
            except OSError:
                # The file vanished or is unreadable; leave it out of the sum.
                continue
    return total_size
def get_subdir_count(start_path='.'):
    """Return the number of immediate subdirectories of *start_path*.

    Only the first level is counted; directories nested deeper are ignored.

    Args:
        start_path: Directory to inspect. Defaults to the current directory.

    Returns:
        The count of direct subdirectories; 0 when ``os.walk`` yields nothing
        for *start_path* (for example when it does not exist).
    """
    # The first tuple produced by os.walk describes start_path itself, so only
    # that one is needed; the default covers a walk that yields nothing.
    _, dirnames, _ = next(os.walk(start_path), (start_path, [], []))
    return len(dirnames)
dir_to_check = "/content/repos"  # replace with your directory

# Report the total size of the directory, converted from bytes to MB.
size_in_mb = get_dir_size(dir_to_check) / (1024 * 1024)
print(f"Total size of '{dir_to_check}' is {size_in_mb} MB")

# Report how many direct subdirectories it contains.
subdir_total = get_subdir_count(dir_to_check)
print(f"Number of subdirectories in '{dir_to_check}' is {subdir_total}")
import os
import time
# Build the text dataset: every file under `d` is flattened to a single line
# (newlines replaced by NEWLINECHAR) and appended to python_code_text_data.txt,
# either whole or cut into chunks at blank lines.
MAX_CHAR_LENGTH = 512
MIN_CHAR_LENGTH = 400
NEWLINECHAR = "<N>"
# Two consecutive newline markers, i.e. a blank line in the original file.
BLOCK_SEPARATOR = NEWLINECHAR * 2


def _iter_chunks(encoded):
    """Yield chunks of *encoded* between MIN_CHAR_LENGTH and MAX_CHAR_LENGTH.

    *encoded* is text whose newlines were already replaced by NEWLINECHAR.
    It is split at BLOCK_SEPARATOR and consecutive pieces (each with the
    separator re-appended) are accumulated until the buffer is at least
    MIN_CHAR_LENGTH long, at which point the buffer is yielded.

    When adding a piece would push the buffer past MAX_CHAR_LENGTH the
    undersized buffer is discarded and accumulation restarts from that piece;
    a piece that is itself longer than MAX_CHAR_LENGTH is dropped. Without
    this reset an oversized buffer could never shrink back into range, so
    nothing else from the same file would ever be emitted.
    """
    buffer = ""
    for piece in encoded.split(BLOCK_SEPARATOR):
        piece += BLOCK_SEPARATOR
        if len(buffer) + len(piece) > MAX_CHAR_LENGTH:
            buffer = ""
        if len(piece) > MAX_CHAR_LENGTH:
            continue
        buffer += piece
        if len(buffer) >= MIN_CHAR_LENGTH:
            yield buffer
            buffer = ""


# Collect the path of every file below the repository root `d`.
full_paths = []
for dirpath, _dirnames, filenames in os.walk(d):
    full_paths.extend(os.path.join(dirpath, name) for name in filenames)

with open("python_code_text_data.txt", "a", encoding="utf-8") as out:
    for fpath in full_paths:
        # Only the read is guarded: an unreadable or undecodable file is
        # reported and skipped, and the handle is always closed.
        try:
            with open(fpath, "r", encoding="utf-8") as src:
                text = src.read()
        except (OSError, UnicodeError) as e:
            print(str(e))
            continue

        encoded = text.replace("\n", NEWLINECHAR)
        if 100 < len(text) <= MAX_CHAR_LENGTH:
            # Small files are written whole, one file per output line.
            out.write(encoded + '\n')
        else:
            # Everything else is cut into chunks, one chunk per output line.
            out.writelines(chunk + '\n' for chunk in _iter_chunks(encoded))