Skip to content

Commit

Permalink
Done it all.
Browse files Browse the repository at this point in the history
Now, time for the documentation.
  • Loading branch information
Fabbro96 committed Apr 20, 2023
1 parent ca19e9f commit 8d4c202
Showing 1 changed file with 98 additions and 67 deletions.
165 changes: 98 additions & 67 deletions bigData_project.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -36,7 +36,7 @@
},
{
"cell_type": "code",
"execution_count": 64,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -72,7 +72,7 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 11,
"metadata": {},
"outputs": [
{
Expand All @@ -99,7 +99,7 @@
},
{
"cell_type": "code",
"execution_count": 67,
"execution_count": 12,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -138,7 +138,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -260,7 +260,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -325,7 +325,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -346,7 +346,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -438,12 +438,36 @@
},
{
"cell_type": "code",
"execution_count": 68,
"execution_count": 17,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"23/04/20 18:28:33 WARN Utils: Your hostname, kerah resolves to a loopback address: 127.0.1.1; using 192.168.178.79 instead (on interface wlp38s0)\n",
"23/04/20 18:28:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting default log level to \"WARN\".\n",
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"23/04/20 18:28:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
]
}
],
"source": [
"%%capture\n",
"spark = SparkSession.builder.appName(\"mySparkApp\")#.config(\"spark.driver.memory\", \"4g\").config(\"spark.driver.maxResultSize\", \"12g\").getOrCreate()"
"spark = SparkSession.builder.appName(\"mySparkApp\").config(\"spark.driver.memory\", \"4g\").config(\"spark.driver.maxResultSize\", \"8g\").getOrCreate()"
]
},
{
Expand All @@ -460,7 +484,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"metadata": {},
"outputs": [
{
Expand All @@ -474,7 +498,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"23/04/20 17:08:18 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors\n"
"23/04/20 18:28:45 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors\n"
]
},
{
Expand Down Expand Up @@ -516,7 +540,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"metadata": {},
"outputs": [
{
Expand All @@ -530,31 +554,46 @@
"name": "stdout",
"output_type": "stream",
"text": [
"23/04/20 17:08:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 17:08:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n"
"23/04/20 18:28:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:28:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:28:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:28:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:28:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:28:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:28:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:28:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:28:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:28:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:28:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:29:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:29:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:29:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:29:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:29:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:29:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:29:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:29:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:29:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:29:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:29:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:29:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:29:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
"23/04/20 18:29:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Stage 14:> (0 + 5) / 5]\r"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[61,231s][warning][gc,alloc] Executor task launch worker for task 4.0 in stage 14.0 (TID 62): Retried waiting for GCLocker too often allocating 33554434 words\n",
"23/04/20 18:29:33 WARN TaskMemoryManager: Failed to allocate a page (268435456 bytes), try again.\n"
]
},
{
Expand Down Expand Up @@ -589,7 +628,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -621,7 +660,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"metadata": {},
"outputs": [
{
Expand All @@ -636,7 +675,7 @@
"%%capture\n",
"df = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"countedWords.csv\")\n",
"\n",
"# raggruppa le righe in base alla colonna \"word\" e somma la colonna \"count\"\n",
"# raggruppo le righe in base alla colonna \"word\" e somma la colonna \"count\"\n",
"sum_df = df.groupBy(\"word\").agg(sum(\"count\").alias(\"total_count\"))\n",
"sum_df = sum_df.withColumn(\"total_count\", sum_df[\"total_count\"].cast(\"int\"))\n",
"\n",
Expand Down Expand Up @@ -666,7 +705,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -689,7 +728,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -707,8 +746,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"> <b>L’output deve contenere, per ogni parola, la lista di pagine di wikipedia\n",
"che contengono quella parola</b>"
"> <b>L’OUTPUT DEVE CONTENERE, PER OGNI PAROLA, LA LISTA DI PAGINE DI WIKIPEDIA CHE CONTENGONO QUELLA PAROLA</b>"
]
},
{
Expand All @@ -727,7 +765,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -768,7 +806,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -795,7 +833,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -826,7 +864,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 27,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -863,7 +901,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 28,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -905,7 +943,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 29,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -954,7 +992,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 30,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -993,7 +1031,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 31,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1050,7 +1088,7 @@
},
{
"cell_type": "code",
"execution_count": 69,
"execution_count": 32,
"metadata": {},
"outputs": [
{
Expand All @@ -1062,19 +1100,12 @@
}
],
"source": [
"choise = input(\"Vuoi rimuovere tutti i file creati? [y/n]\").lower()\n",
"choise = input(\"Vuoi rimuovere tutti i file con estensione CSV creati? [y/n]\").lower()\n",
"while True: \n",
" if choise == \"y\" or choise.lower == \"yes\":\n",
" os.remove(\"occurrences.csv\")\n",
" os.remove(\"titlesSize.csv\")\n",
" os.remove(\"singleWordCounted.csv\")\n",
" os.remove(\"results.csv\")\n",
" os.remove(\"fileWords.csv\")\n",
" os.remove(\"fileTitles.csv\")\n",
" os.remove(\"rawMerged.csv\")\n",
" os.remove('countedWords.csv')\n",
" os.remove(\"finalFileToAnalyze.csv\")\n",
" os.remove(\"file.csv\")\n",
" if choise == \"y\" or choise == \"yes\":\n",
" for x in os.listdir():\n",
" if x.endswith(\".csv\"):\n",
" os.remove(x)\n",
" print(\"File rimossi con successo!\")\n",
" break\n",
" if choise == \"n\" or choise == \"no\":\n",
Expand Down

0 comments on commit 8d4c202

Please sign in to comment.