Skip to content

Commit

Permalink
first version of dataset added
Browse files Browse the repository at this point in the history
  • Loading branch information
simitii authored and urasmutlu committed Aug 6, 2019
1 parent 7c135d5 commit 9bce2be
Show file tree
Hide file tree
Showing 852 changed files with 544,477 additions and 46 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
compressed_paper_folders/
paper_folders/

.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
27 changes: 15 additions & 12 deletions dataset_generation/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,21 @@
if not os.path.exists(EXTRACT_FOLDER):
os.mkdir(EXTRACT_FOLDER)

for i in range(nb_papers):
for i in range(409, nb_papers, 1):
paper_link = lines[2*i+1]
paper_code = paper_link.split("/")[-1]
paper_source_link = "https://arxiv.org/e-print/" + paper_code

# Download the file from `paper_source_link` and save it locally under `DOWNLOAD_FOLDER+str(i)+".tar.gz"`:
compressed_file_path = DOWNLOAD_FOLDER+str(i)+".tar.gz"
with urllib.request.urlopen(paper_source_link) as response, open(compressed_file_path, 'wb') as out_file:
shutil.copyfileobj(response, out_file)

# Extract the contents of the downloaded tar file
tar = tarfile.open(compressed_file_path)
paper_folder_dir = EXTRACT_FOLDER + str(i) + "/"
tar.extractall(path=paper_folder_dir)
tar.close()
try:
# Download the file from `paper_source_link` and save it locally under `DOWNLOAD_FOLDER+str(i)+".tar.gz"`:
compressed_file_path = DOWNLOAD_FOLDER+str(i)+".tar.gz"
with urllib.request.urlopen(paper_source_link) as response, open(compressed_file_path, 'wb') as out_file:
shutil.copyfileobj(response, out_file)

# Extract the contents of the downloaded tar file
tar = tarfile.open(compressed_file_path)
paper_folder_dir = EXTRACT_FOLDER + str(i) + "/"
tar.extractall(path=paper_folder_dir)
tar.close()
except Exception:
print("error at paper %g" % (i))
print("progress: %g / %g" % (i,nb_papers), end="\r")
65 changes: 37 additions & 28 deletions dataset_generation/latex_input_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,64 +23,73 @@ def import_resolve(tex, path):
dir_path = os.path.dirname(path) + "/"

for _input in soup.find_all('input'):
#print("input statement detected")
path = os.path.join(dir_path, _input.args[0])
if not os.path.exists(path):
path = path + ".tex"
print("Resolved Path:", path)
#print("Resolved Path:", path)
_input.replace(*import_resolve(open(path), dir_path).contents)

# TODO: verify the following resolver blocks (subimport/import/include) behave correctly
# resolve subimports
for subimport in soup.find_all('subimport'):
#print("subimport statement detected")
path = os.path.join(dir_path, subimport.args[0] + subimport.args[1])
if not os.path.exists(path):
path = path + ".tex"
print("Resolved Path:", path)
subimport.replace(*import_resolve(open(path)).contents)
#print("Resolved Path:", path)
subimport.replace(*import_resolve(open(path), dir_path).contents)

# resolve imports
for _import in soup.find_all('import'):
#print("import statement detected")
path = os.path.join(dir_path, _import.args[0])
if not os.path.exists(path):
path = path + ".tex"
print("Resolved Path:", path)
_import.replace(*import_resolve(open(path)).contents)
#print("Resolved Path:", path)
_import.replace(*import_resolve(open(path), dir_path).contents)

# resolve includes
for include in soup.find_all('include'):
#print("include statement detected")
path = os.path.join(dir_path, include.args[0])
if not os.path.exists(path):
path = path + ".tex"
print("Resolved Path:", path)
include.replace(*import_resolve(open(path)).contents)
#print("Resolved Path:", path)
include.replace(*import_resolve(open(path), dir_path).contents)

return soup

for i in range(1):
for i in range(401, 1000, 1):
paper_folder_dir = EXTRACT_FOLDER + str(i) + "/**/"
extension = "*.tex"
tex_files = glob.glob(paper_folder_dir + extension, recursive=True)

root_files = []

#print(tex_files)
try:
for f_path in tex_files:
with open(f_path) as f:
tex = f.read()
soup = TexSoup(tex)
if soup.documentclass is not None:
latex_object = import_resolve(tex, f_path)
root_files.append(latex_object)

print(tex_files)
for f_path in tex_files:
with open(f_path) as f:
tex = f.read()
print(tex)
soup = TexSoup(tex)
if soup.documentclass is not None:
latex_object = import_resolve(tex, f_path)
root_files.append(latex_object)
if len(root_files) < 1:
print("no root file?")
elif len(root_files) > 1:
print("writing multiple root files for paper", i)
for j in range(len(root_files)):
with open(FINAL_FOLDER + str(i) + "-" + str(j) + ".tex", "wt") as f:
f.write(str(root_files[j]))
else:
print("writing single root file for paper", i)
with open(FINAL_FOLDER + str(i) + ".tex", "wt") as f:
f.write(str(root_files[0]))

if len(root_files) < 1:
print("no root file?")
elif len(root_files) > 1:
print("writing multiple root files for paper", i)
for j in range(len(root_files)):
with open(FINAL_FOLDER + str(i) + "-" + str(j) + ".tex", "wt") as f:
f.write(str(root_files[j]))
else:
print("writing single root file for paper", i)
with open(FINAL_FOLDER + str(i) + ".tex", "wt") as f:
f.write(str(root_files[0]))
except Exception:
print("error at paper %g" % (i))

print("progress: %g / %g" % (i,nb_papers))
Loading

0 comments on commit 9bce2be

Please sign in to comment.