{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "q1.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "id": "508rMLhuaRh4", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "9d99aebd-baa3-4d29-f484-0dbdcca454ec" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Downloading...\n", "From: https://drive.google.com/uc?id=1kTJmHH3qEyivtSJvnVUMYpWnZcQl5XaV\n", "To: /content/Bigdata_hw2_datasets.zip\n", "100% 6.28M/6.28M [00:00<00:00, 46.4MB/s]\n" ] } ], "source": [ "!pip install --upgrade --no-cache-dir gdown >/dev/null\n", "!gdown 1kTJmHH3qEyivtSJvnVUMYpWnZcQl5XaV" ] }, { "cell_type": "code", "source": [ "!unzip Bigdata_hw2_datasets.zip" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "jwm4llCFbxHh", "outputId": "e5ebe3bd-aa2b-4b25-cae6-2c017032b8bc" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Archive: Bigdata_hw2_datasets.zip\n", " creating: Bigdata_hw2_datasets/\n", " creating: Bigdata_hw2_datasets/q1/\n", " inflating: Bigdata_hw2_datasets/q1/stream_data_dgim.txt \n", " creating: Bigdata_hw2_datasets/q2/\n", " inflating: Bigdata_hw2_datasets/q2/games.csv \n", " inflating: Bigdata_hw2_datasets/q2/ratings.csv \n", " creating: Bigdata_hw2_datasets/q3/\n", " inflating: Bigdata_hw2_datasets/q3/c1.txt \n", " inflating: Bigdata_hw2_datasets/q3/c2.txt \n", " inflating: Bigdata_hw2_datasets/q3/data.txt \n" ] } ] }, { "cell_type": "code", "source": [ "import math\n", "import numpy as np\n", "import time\n", "\n", "INPUT_FILE = 'Bigdata_hw2_datasets/q1/stream_data_dgim.txt'" ], "metadata": { "id": "MnB4ZwTEcSNr" }, "execution_count": 3, "outputs": [] }, { "cell_type": "code", "source": [ "class DGIM:\n", " def __init__(self, file, N):\n", " \"\"\"\n", " Implementation of DGIM Algorithm\n", "\n", " Args:\n", " file (str): Input file path\n", " N (int): Window size\n", " \"\"\"\n", " self.file = file\n", " self.N = N\n", " self.ts = 0\n", " self.buckets = []\n", " self._init_buckets()\n", "\n", " def _init_buckets(self):\n", " \"\"\"\n", " Create the buckets according to the window size.\n", " \"\"\"\n", " for i in range(int(math.log(self.N, 2))+1):\n", " self.buckets.append([])\n", "\n", " def _old_bucket(self, k=0):\n", " \"\"\"\n", " Find the oldest bucket according to k.\n", "\n", " Args:\n", " k (int, optional): Defaults to N.\n", "\n", " Returns:\n", " tuple: old bucket number size and old bucket end-timestamp \n", " \"\"\"\n", " k = self.N if k == 0 else k\n", " obi = 0\n", " obt = 0\n", " for i in range(len(self.buckets)):\n", " for ets in self.buckets[i]:\n", " if ets >= self.ts - k:\n", " obi = i\n", " obt = ets\n", " else:\n", " return obi, obt\n", " return obi, obt\n", "\n", " def _update(self):\n", " \"\"\"\n", " Update the buckets based on the algorithm constraints.\n", " If we have more than 2 buckets of each size, merge them.\n", " \"\"\"\n", " for i in range(len(self.buckets)):\n", " if len(self.buckets[i]) > 2:\n", " self.buckets[i].pop()\n", " tmp = self.buckets[i].pop()\n", " if i != len(self.buckets) - 1:\n", " self.buckets[i+1].insert(0, tmp)\n", "\n", " def count(self, k=0):\n", " \"\"\"\n", " Count the ones in the last k bits.\n", "\n", " Args:\n", " k (int, optional): Defaults to N.\n", "\n", " Returns:\n", " int: count\n", " \"\"\"\n", " cnt = 0\n", " obi, obt = self._old_bucket(k)\n", " for i in range(len(self.buckets)):\n", " if i > obi:\n", " break\n", " for ets in self.buckets[i]:\n", " if ets > obt:\n", " cnt += 2**i\n", " elif ets == obt:\n", " cnt += int(0.5 * 2**i)\n", " return cnt\n", "\n", " def run(self):\n", " \"\"\"\n", " Read the file in the streaming mode and process.\n", " \"\"\"\n", " with open(self.file) as f:\n", " while True:\n", " x = f.read(2).strip()\n", " if not x:\n", " break\n", " self.ts += 1\n", " obi, obt = self._old_bucket()\n", " if obt == self.ts - self.N:\n", " self.buckets[obi].remove(obt)\n", " if x == \"1\":\n", " self.buckets[0].insert(0, self.ts)\n", " self._update()" ], "metadata": { "id": "3ZQ0MPd3eEff" }, "execution_count": 125, "outputs": [] }, { "cell_type": "code", "source": [ "start = time.time()\n", "\n", "dgim = DGIM(INPUT_FILE, 1000)\n", "dgim.run()\n", "result_1000 = dgim.count(1000)\n", " \n", "time1 = time.time() - start\n", "\n", "print(\"The number of 1 bits in the window with size 1000:\", result_1000)\n", "print(\"The running time of DGIM with window size 1000:\", time1)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vKPEe6GneKG5", "outputId": "a56a136c-30e0-49a8-9639-79be4222871f" }, "execution_count": 126, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The number of 1 bits in the window with size 1000: 508\n", "The running time of DGIM with window size 1000: 0.19089221954345703\n" ] } ] }, { "cell_type": "code", "source": [ "print(\"The number of 1 bits in the last 500 bits:\", dgim.count(500))\n", "print(\"The number of 1 bits in the last 200 bits:\", dgim.count(200))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Vi5f-E87I48I", "outputId": "4b182776-918a-46c3-84d2-b463601d0e77" }, "execution_count": 128, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The number of 1 bits in the last 500 bits: 220\n", "The number of 1 bits in the last 200 bits: 76\n" ] } ] }, { "cell_type": "code", "source": [ "start = time.time()\n", "\n", "data = np.loadtxt(INPUT_FILE, dtype=\"uint8\")\n", "acc_cnt = np.sum(data[data.shape[0] - 1000:])\n", "\n", "time2 = time.time() - start\n", "\n", "print(\"The accurate number of 1 bits in the window with size 1000:\", acc_cnt)\n", "print(\"The running time of accurate result with window size 1000:\", time2)\n", "print(\"The accuracy of DGIM with window size 1000:\", 1 - abs(result_1000 - acc_cnt) / acc_cnt)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ghwTYjRqJYOI", "outputId": "86ac5a98-3160-4f02-c2ef-d30f3daf9026" }, "execution_count": 129, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The accurate number of 1 bits in the window with size 1000: 391\n", "The running time of accurate result with window size 1000: 0.03008556365966797\n", "The accuracy of DGIM with window size 1000: 0.70076726342711\n" ] } ] } ] }