Skip to content

Commit

Permalink
Changed from .csv to .gzip in parquet
Browse files Browse the repository at this point in the history
  • Loading branch information
Jon Saad-Falcon committed Jun 4, 2021
1 parent 4e8cda8 commit a31c719
Show file tree
Hide file tree
Showing 14 changed files with 8,306 additions and 20 deletions.
9 changes: 6 additions & 3 deletions Preprocessing/NewGeneratePeopleMapFilesCited.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
# Other libraries
import re
import math
import pyarrow
import fastparquet



Expand Down Expand Up @@ -651,7 +653,8 @@ def changeRecoloring(total_clusters, maxNumberOfClusters, maxNumberOfKeywords):
def generatePeopleMapFiles(givenCSV, specifiedName, maxClusters, maxKeywordsEmphasis):

# Load the dataset from a Parquet file (the parameter is still named givenCSV)
df = pd.read_csv(givenCSV)
#df = pd.read_csv(givenCSV)
df = pd.read_parquet(givenCSV)
df = cleanCSV(df)
print("Completed cleaning CSV")

Expand Down Expand Up @@ -702,10 +705,10 @@ def generatePeopleMapFiles(givenCSV, specifiedName, maxClusters, maxKeywordsEmph
maxKeywordsEmphasis = 5

specifiedCitedName = "cited"
mostCitedCSV = "citedScholarDataset.csv"
mostCitedCSV = "citedScholarDataset.gzip"

specifiedRecentName = "recent"
mostRecentCSV = "recentScholarDataset.csv"
mostRecentCSV = "recentScholarDataset.gzip"

print("Started")

Expand Down
10 changes: 6 additions & 4 deletions Preprocessing/NewGeneratePeopleMapFilesRecent.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
# Other libraries
import re
import math

import pyarrow
import fastparquet



Expand Down Expand Up @@ -651,7 +652,8 @@ def changeRecoloring(total_clusters, maxNumberOfClusters, maxNumberOfKeywords):
def generatePeopleMapFiles(givenCSV, specifiedName, maxClusters, maxKeywordsEmphasis):

# Load the dataset from a Parquet file (the parameter is still named givenCSV)
df = pd.read_csv(givenCSV)
#df = pd.read_csv(givenCSV)
df = pd.read_parquet(givenCSV)
df = cleanCSV(df)
print("Completed cleaning CSV")

Expand Down Expand Up @@ -702,10 +704,10 @@ def generatePeopleMapFiles(givenCSV, specifiedName, maxClusters, maxKeywordsEmph
maxKeywordsEmphasis = 5

specifiedCitedName = "cited"
mostCitedCSV = "citedScholarDataset.csv"
mostCitedCSV = "citedScholarDataset.gzip"

specifiedRecentName = "recent"
mostRecentCSV = "recentScholarDataset.csv"
mostRecentCSV = "recentScholarDataset.gzip"

print("Started")

Expand Down
25 changes: 19 additions & 6 deletions Preprocessing/NewGoogleQueryCited.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import numpy
import re
import pandas as pd
import pyarrow
import fastparquet

regex = re.compile('[^a-zA-Z]')

Expand All @@ -25,8 +27,8 @@ def clean_abstract(ab):
def newGenerateCitedGoogleScholarCSV(list_of_researchers):
grid = []

titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
grid.append(titles_array)
#titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
#grid.append(titles_array)

outputGrid = []
output_columns = ['Name', 'URL', 'SuccessfullyScraped']
Expand Down Expand Up @@ -168,8 +170,8 @@ def newGenerateCitedGoogleScholarCSV(list_of_researchers):
def newGenerateRecentGoogleScholarCSV(list_of_researchers):
grid = []

titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
grid.append(titles_array)
#titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
#grid.append(titles_array)

outputGrid = []
output_columns = ['Name', 'URL', 'SuccessfullyScraped']
Expand Down Expand Up @@ -354,13 +356,24 @@ def newGenerateRecentGoogleScholarCSV(list_of_researchers):

cited, results = newGenerateCitedGoogleScholarCSV(df)

citedArray = numpy.array(cited)
numpy.savetxt('citedScholarDataset.csv', citedArray, delimiter=',', fmt='%s', encoding ='utf8')
titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
cited = pd.DataFrame(cited, columns=titles_array)
print(cited.columns)
print(cited.head())

cited.to_parquet('citedScholarDataset.gzip')

#citedArray = numpy.array(cited)
#numpy.savetxt('citedScholarDataset.csv', citedArray, delimiter=',', fmt='%s', encoding ='utf8')

resultsArray = numpy.array(results)
numpy.savetxt('ResearchersScrapedCited.csv', resultsArray, delimiter=',', fmt='%s', encoding ='utf8')


############################################################





#recent, recentResults = newGenerateRecentGoogleScholarCSV(df)
Expand Down
21 changes: 15 additions & 6 deletions Preprocessing/NewGoogleQueryRecent.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import numpy
import re
import pandas as pd
import pyarrow
import fastparquet

regex = re.compile('[^a-zA-Z]')

Expand All @@ -25,8 +27,8 @@ def clean_abstract(ab):
def newGenerateCitedGoogleScholarCSV(list_of_researchers):
grid = []

titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
grid.append(titles_array)
#titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
#grid.append(titles_array)

outputGrid = []
output_columns = ['Name', 'URL', 'SuccessfullyScraped']
Expand Down Expand Up @@ -168,8 +170,8 @@ def newGenerateCitedGoogleScholarCSV(list_of_researchers):
def newGenerateRecentGoogleScholarCSV(list_of_researchers):
grid = []

titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
grid.append(titles_array)
#titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
#grid.append(titles_array)

outputGrid = []
output_columns = ['Name', 'URL', 'SuccessfullyScraped']
Expand Down Expand Up @@ -365,8 +367,15 @@ def newGenerateRecentGoogleScholarCSV(list_of_researchers):

recent, recentResults = newGenerateRecentGoogleScholarCSV(df)

recentArray = numpy.array(recent)
numpy.savetxt('recentScholarDataset.csv', recentArray, delimiter=',', fmt='%s', encoding ='utf8')
titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
recent = pd.DataFrame(recent, columns=titles_array)
print(recent.columns)
print(recent.head())

recent.to_parquet('recentScholarDataset.gzip')

#recentArray = numpy.array(recent)
#numpy.savetxt('recentScholarDataset.csv', recentArray, delimiter=',', fmt='%s', encoding ='utf8')

recentResultsArray = numpy.array(recentResults)
numpy.savetxt('ResearchersScrapedRecent.csv', recentResultsArray, delimiter=',', fmt='%s', encoding ='utf8')
Expand Down
11 changes: 11 additions & 0 deletions ResearchersDataset.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
URL
https://scholar.google.com/citations?user=B3U7yvcAAAAJ&hl=it
https://scholar.google.com/citations?user=JnkeH28AAAAJ&hl=it
https://scholar.google.co.uk/citations?user=sbcJOSoAAAAJ&hl=en
https://scholar.google.co.uk/citations?user=JXTpQXIAAAAJ&hl=en
https://scholar.google.com.mx/citations?user=SPrHADoAAAAJ&hl=en
https://scholar.google.co.uk/citations?user=o2gCskMAAAAJ&hl=en
https://scholar.google.ch/citations?user=IoKBsjcAAAAJ&hl=en
https://scholar.google.com/citations?user=uE1oO1gAAAAJ&hl=en
https://scholar.google.com/citations?user=vNvRYksAAAAJ&hl=en
https://scholar.google.com/citations?user=wIFFAosAAAAJ&hl=en
10 changes: 10 additions & 0 deletions ResearchersScrapedCited.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Guido Noto La Diega,https://scholar.google.com/citations?user=B3U7yvcAAAAJ&hl=it,True
Rossana Ducato,https://scholar.google.com/citations?user=JnkeH28AAAAJ&hl=it,True
,https://scholar.google.co.uk/citations?user=sbcJOSoAAAAJ&hl=en,False
,https://scholar.google.co.uk/citations?user=JXTpQXIAAAAJ&hl=en,False
,https://scholar.google.com.mx/citations?user=SPrHADoAAAAJ&hl=en,False
,https://scholar.google.co.uk/citations?user=o2gCskMAAAAJ&hl=en,False
,https://scholar.google.ch/citations?user=IoKBsjcAAAAJ&hl=en,False
Jade Kouletakis,https://scholar.google.com/citations?user=uE1oO1gAAAAJ&hl=en,True
"Irene Couzigou, PhD, LLM",https://scholar.google.com/citations?user=vNvRYksAAAAJ&hl=en,True
Patricia Živković,https://scholar.google.com/citations?user=wIFFAosAAAAJ&hl=en,True
10 changes: 10 additions & 0 deletions ResearchersScrapedRecent.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Guido Noto La Diega,https://scholar.google.com/citations?user=B3U7yvcAAAAJ&hl=it,True
Rossana Ducato,https://scholar.google.com/citations?user=JnkeH28AAAAJ&hl=it,True
,https://scholar.google.co.uk/citations?user=sbcJOSoAAAAJ&hl=en,False
,https://scholar.google.co.uk/citations?user=JXTpQXIAAAAJ&hl=en,False
,https://scholar.google.com.mx/citations?user=SPrHADoAAAAJ&hl=en,False
,https://scholar.google.co.uk/citations?user=o2gCskMAAAAJ&hl=en,False
,https://scholar.google.ch/citations?user=IoKBsjcAAAAJ&hl=en,False
Jade Kouletakis,https://scholar.google.com/citations?user=uE1oO1gAAAAJ&hl=en,True
"Irene Couzigou, PhD, LLM",https://scholar.google.com/citations?user=vNvRYksAAAAJ&hl=en,True
Patricia Živković,https://scholar.google.com/citations?user=wIFFAosAAAAJ&hl=en,True
Loading

0 comments on commit a31c719

Please sign in to comment.