Skip to content

Commit

Permalink
Update Frequency Analysis and calculate frequencies offline
Browse files Browse the repository at this point in the history
  • Loading branch information
uSaiPrashanth committed Jun 9, 2023
1 parent 4128486 commit c13816f
Show file tree
Hide file tree
Showing 5 changed files with 252,577 additions and 252,664 deletions.
2 changes: 2 additions & 0 deletions working_dirs/orz/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Any C++ Output files
*.out
96 changes: 96 additions & 0 deletions working_dirs/orz/calculate_token_frequencies.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#include<bits/stdc++.h>
#include "tqdm/tqdm.h"
using namespace std;

// Iterates over the pile and counts the number of times each token is memorized and not memorized
// Saves the results to a json file
int run_pile(vector<int>& indicies){
vector<long long> memorized = vector<long long>(60000, 0);
vector<long long> non_memorized = vector<long long>(60000, 0);

ifstream ios;
ios.open("/scratch/pile/standard/document.bin", ios::binary);
// Each sequence has 2049 tokens, every iteration during training has a global batch size of 1024
int length = 2049*1024*16;
uint16_t* buffer = new uint16_t[length];

int curr = 0;
printf("Starting Iteration\n");
for (int i = 0;i < 143000*1024; i += 1024*16){
ios.read((char*)buffer, length*sizeof(uint16_t));
for(int j = 0;j < 1024*16;j++){
bool is_mem = false;
if (i + j >= 143000*1024){
break;
}
if(indicies[curr] == (i + j)){
is_mem = true;
curr++;

if(curr == indicies.size()){
is_mem = false;
curr = 0;
}
}
else{
is_mem = false;
}

for(int k = 0;k < 2049;k++){
int data = buffer[j*2049+k];
if(is_mem){
memorized[data]++;
}
else{
non_memorized[data]++;
}
}
}
printf("Finished Iteration %d\n", i/1024);
}

// Save the results to a json file
ofstream savefile;
savefile.open("/fsx/orz/semantic-memorization/working_dirs/orz/standard_memorized_frequencies.json");
savefile << "{\n";
savefile << "\"memorized\": [";
for(int i = 0;i < 60000;i++){
savefile << memorized[i];
if(i != 59999){
savefile << ", ";
}
}
savefile << "],\n";
savefile << "\"non_memorized\": [";
for(int i = 0;i < 60000;i++){
savefile << non_memorized[i];
if(i != 59999){
savefile << ", ";
}
}
savefile << "]\n";
savefile << "}";
savefile.close();
return 0;
}


// Loads the indicies of the memorized tokens
int load_indicies(vector<int>& data){
fstream ios;
ios.open("/fsx/orz/memorization-evals/memorized_evals/mem_standard.txt");
int index;
while (ios >> index){
data.push_back(index);
}
sort(data.begin(), data.end());
ios.close();
return 0;
}

int main(){
vector<int> indicies;
load_indicies(indicies);
run_pile(indicies);
return 0;
}
4 changes: 4 additions & 0 deletions working_dirs/orz/deduped_memorized_frequencies.json

Large diffs are not rendered by default.

Loading

0 comments on commit c13816f

Please sign in to comment.