Changed the scholar dataset files from CSV (.csv) to Parquet format (saved with a .gzip file extension)

poloclub · Jun 4, 2021 · a31c719 · a31c719
1 parent 4e8cda8
commit a31c719
Show file tree

Hide file tree

Showing 14 changed files with 8,306 additions and 20 deletions.
diff --git a/Preprocessing/NewGeneratePeopleMapFilesCited.py b/Preprocessing/NewGeneratePeopleMapFilesCited.py
@@ -23,6 +23,8 @@
 # Other libraries
 import re
 import math
+import pyarrow
+import fastparquet
 
 
 
@@ -651,7 +653,8 @@ def changeRecoloring(total_clusters, maxNumberOfClusters, maxNumberOfKeywords):
 def generatePeopleMapFiles(givenCSV, specifiedName, maxClusters, maxKeywordsEmphasis):
 
  # Load the CSV file
- df = pd.read_csv(givenCSV)
+ #df = pd.read_csv(givenCSV)
+ df = pd.read_parquet(givenCSV)
  df = cleanCSV(df)
  print("Completed cleaning CSV")
 
@@ -702,10 +705,10 @@ def generatePeopleMapFiles(givenCSV, specifiedName, maxClusters, maxKeywordsEmph
 maxKeywordsEmphasis = 5
 
 specifiedCitedName = "cited"
-mostCitedCSV = "citedScholarDataset.csv"
+mostCitedCSV = "citedScholarDataset.gzip"
 
 specifiedRecentName = "recent"
-mostRecentCSV = "recentScholarDataset.csv"
+mostRecentCSV = "recentScholarDataset.gzip"
 
 print("Started")
 

diff --git a/Preprocessing/NewGeneratePeopleMapFilesRecent.py b/Preprocessing/NewGeneratePeopleMapFilesRecent.py
@@ -23,7 +23,8 @@
 # Other libraries
 import re
 import math
-
+import pyarrow
+import fastparquet
 
 
 
@@ -651,7 +652,8 @@ def changeRecoloring(total_clusters, maxNumberOfClusters, maxNumberOfKeywords):
 def generatePeopleMapFiles(givenCSV, specifiedName, maxClusters, maxKeywordsEmphasis):
 
  # Load the CSV file
- df = pd.read_csv(givenCSV)
+ #df = pd.read_csv(givenCSV)
+ df = pd.read_parquet(givenCSV)
  df = cleanCSV(df)
  print("Completed cleaning CSV")
 
@@ -702,10 +704,10 @@ def generatePeopleMapFiles(givenCSV, specifiedName, maxClusters, maxKeywordsEmph
 maxKeywordsEmphasis = 5
 
 specifiedCitedName = "cited"
-mostCitedCSV = "citedScholarDataset.csv"
+mostCitedCSV = "citedScholarDataset.gzip"
 
 specifiedRecentName = "recent"
-mostRecentCSV = "recentScholarDataset.csv"
+mostRecentCSV = "recentScholarDataset.gzip"
 
 print("Started")
 

diff --git a/Preprocessing/NewGoogleQueryCited.py b/Preprocessing/NewGoogleQueryCited.py
@@ -2,6 +2,8 @@
 import numpy
 import re
 import pandas as pd
+import pyarrow
+import fastparquet
 
 regex = re.compile('[^a-zA-Z]')
 
@@ -25,8 +27,8 @@ def clean_abstract(ab):
 def newGenerateCitedGoogleScholarCSV(list_of_researchers):
  grid = []
 
- titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
- grid.append(titles_array)
+ #titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
+ #grid.append(titles_array)
 
  outputGrid = []
  output_columns = ['Name', 'URL', 'SuccessfullyScraped']
@@ -168,8 +170,8 @@ def newGenerateCitedGoogleScholarCSV(list_of_researchers):
 def newGenerateRecentGoogleScholarCSV(list_of_researchers):
  grid = []
 
- titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
- grid.append(titles_array)
+ #titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
+ #grid.append(titles_array)
 
  outputGrid = []
  output_columns = ['Name', 'URL', 'SuccessfullyScraped']
@@ -354,13 +356,24 @@ def newGenerateRecentGoogleScholarCSV(list_of_researchers):
 
 cited, results = newGenerateCitedGoogleScholarCSV(df)
 
-citedArray = numpy.array(cited)
-numpy.savetxt('citedScholarDataset.csv', citedArray, delimiter=',', fmt='%s', encoding ='utf8')
+titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
+cited = pd.DataFrame(cited, columns=titles_array) 
+print(cited.columns)
+print(cited.head())
+
+cited.to_parquet('citedScholarDataset.gzip')
+
+#citedArray = numpy.array(cited)
+#numpy.savetxt('citedScholarDataset.csv', citedArray, delimiter=',', fmt='%s', encoding ='utf8')
 
 resultsArray = numpy.array(results)
 numpy.savetxt('ResearchersScrapedCited.csv', resultsArray, delimiter=',', fmt='%s', encoding ='utf8')
 
 
+############################################################
+
+
+
 
 
 #recent, recentResults = newGenerateRecentGoogleScholarCSV(df)

diff --git a/Preprocessing/NewGoogleQueryRecent.py b/Preprocessing/NewGoogleQueryRecent.py
@@ -2,6 +2,8 @@
 import numpy
 import re
 import pandas as pd
+import pyarrow
+import fastparquet
 
 regex = re.compile('[^a-zA-Z]')
 
@@ -25,8 +27,8 @@ def clean_abstract(ab):
 def newGenerateCitedGoogleScholarCSV(list_of_researchers):
  grid = []
 
- titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
- grid.append(titles_array)
+ #titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
+ #grid.append(titles_array)
 
  outputGrid = []
  output_columns = ['Name', 'URL', 'SuccessfullyScraped']
@@ -168,8 +170,8 @@ def newGenerateCitedGoogleScholarCSV(list_of_researchers):
 def newGenerateRecentGoogleScholarCSV(list_of_researchers):
  grid = []
 
- titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
- grid.append(titles_array)
+ #titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
+ #grid.append(titles_array)
 
  outputGrid = []
  output_columns = ['Name', 'URL', 'SuccessfullyScraped']
@@ -365,8 +367,15 @@ def newGenerateRecentGoogleScholarCSV(list_of_researchers):
 
 recent, recentResults = newGenerateRecentGoogleScholarCSV(df)
 
-recentArray = numpy.array(recent)
-numpy.savetxt('recentScholarDataset.csv', recentArray, delimiter=',', fmt='%s', encoding ='utf8')
+titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
+recent = pd.DataFrame(recent, columns=titles_array) 
+print(recent.columns)
+print(recent.head())
+
+recent.to_parquet('recentScholarDataset.gzip')
+
+#recentArray = numpy.array(recent)
+#numpy.savetxt('recentScholarDataset.csv', recentArray, delimiter=',', fmt='%s', encoding ='utf8')
 
 recentResultsArray = numpy.array(recentResults)
 numpy.savetxt('ResearchersScrapedRecent.csv', recentResultsArray, delimiter=',', fmt='%s', encoding ='utf8')

diff --git a/ResearchersDataset.csv b/ResearchersDataset.csv
@@ -0,0 +1,11 @@
+URL
+https://scholar.google.com/citations?user=B3U7yvcAAAAJ&hl=it
+https://scholar.google.com/citations?user=JnkeH28AAAAJ&hl=it
+https://scholar.google.co.uk/citations?user=sbcJOSoAAAAJ&hl=en
+https://scholar.google.co.uk/citations?user=JXTpQXIAAAAJ&hl=en
+https://scholar.google.com.mx/citations?user=SPrHADoAAAAJ&hl=en
+https://scholar.google.co.uk/citations?user=o2gCskMAAAAJ&hl=en
+https://scholar.google.ch/citations?user=IoKBsjcAAAAJ&hl=en
+https://scholar.google.com/citations?user=uE1oO1gAAAAJ&hl=en
+https://scholar.google.com/citations?user=vNvRYksAAAAJ&hl=en
+https://scholar.google.com/citations?user=wIFFAosAAAAJ&hl=en
diff --git a/ResearchersScrapedCited.csv b/ResearchersScrapedCited.csv
@@ -0,0 +1,10 @@
+Guido Noto La Diega,https://scholar.google.com/citations?user=B3U7yvcAAAAJ&hl=it,True
+Rossana Ducato,https://scholar.google.com/citations?user=JnkeH28AAAAJ&hl=it,True
+,https://scholar.google.co.uk/citations?user=sbcJOSoAAAAJ&hl=en,False
+,https://scholar.google.co.uk/citations?user=JXTpQXIAAAAJ&hl=en,False
+,https://scholar.google.com.mx/citations?user=SPrHADoAAAAJ&hl=en,False
+,https://scholar.google.co.uk/citations?user=o2gCskMAAAAJ&hl=en,False
+,https://scholar.google.ch/citations?user=IoKBsjcAAAAJ&hl=en,False
+Jade Kouletakis,https://scholar.google.com/citations?user=uE1oO1gAAAAJ&hl=en,True
+"Irene Couzigou, PhD, LLM",https://scholar.google.com/citations?user=vNvRYksAAAAJ&hl=en,True
+Patricia Živković,https://scholar.google.com/citations?user=wIFFAosAAAAJ&hl=en,True
diff --git a/ResearchersScrapedRecent.csv b/ResearchersScrapedRecent.csv
@@ -0,0 +1,10 @@
+Guido Noto La Diega,https://scholar.google.com/citations?user=B3U7yvcAAAAJ&hl=it,True
+Rossana Ducato,https://scholar.google.com/citations?user=JnkeH28AAAAJ&hl=it,True
+,https://scholar.google.co.uk/citations?user=sbcJOSoAAAAJ&hl=en,False
+,https://scholar.google.co.uk/citations?user=JXTpQXIAAAAJ&hl=en,False
+,https://scholar.google.com.mx/citations?user=SPrHADoAAAAJ&hl=en,False
+,https://scholar.google.co.uk/citations?user=o2gCskMAAAAJ&hl=en,False
+,https://scholar.google.ch/citations?user=IoKBsjcAAAAJ&hl=en,False
+Jade Kouletakis,https://scholar.google.com/citations?user=uE1oO1gAAAAJ&hl=en,True
+"Irene Couzigou, PhD, LLM",https://scholar.google.com/citations?user=vNvRYksAAAAJ&hl=en,True
+Patricia Živković,https://scholar.google.com/citations?user=wIFFAosAAAAJ&hl=en,True