-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
308 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
# See this page for info about the format https://wikitech.wikimedia.org/wiki/Analytics/Data/Pagecounts-all-sites | ||
import tables, parseutils, strutils, threadpool | ||
|
||
# Name of the pagecounts dump that is processed when this module is run.
const
  filename = "pagecounts-20160101-050000"
|
||
type
  Stats = ref object
    # One pagecounts entry. The fields follow the columns of a line such as
    # `en Main_Page 242332 4737756101`.
    projectName: string
    pageTitle: string
    requests: int
    contentSize: int
|
||
proc `$`(stats: Stats): string =
  ## Renders all four fields of `stats` as a single parenthesised line.
  result = format(
    "(projectName: $#, pageTitle: $#, requests: $#, contentSize: $#)",
    stats.projectName,
    stats.pageTitle,
    $stats.requests,
    $stats.contentSize
  )
|
||
proc parse(chunk: string): Stats =
  ## Scans every line of `chunk` and returns the "en" entry whose request
  ## count is the highest seen. When no line qualifies the returned `Stats`
  ## keeps its initial empty strings and zero counters.
  # Each line looks like: en Main_Page 242332 4737756101
  result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)

  # Scratch strings, cleared and refilled for every line.
  var
    project = ""
    title = ""
    hits = ""
    size = ""
  for row in splitLines(chunk):
    var pos = 0
    project.setLen(0)
    inc(pos, parseUntil(row, project, Whitespace, pos))
    inc(pos, skipWhitespace(row, pos))
    title.setLen(0)
    inc(pos, parseUntil(row, title, Whitespace, pos))
    inc(pos, skipWhitespace(row, pos))
    hits.setLen(0)
    inc(pos, parseUntil(row, hits, Whitespace, pos))
    inc(pos, skipWhitespace(row, pos))
    size.setLen(0)
    inc(pos, parseUntil(row, size, Whitespace, pos))
    inc(pos, skipWhitespace(row, pos))

    # Lines lacking either numeric column are ignored.
    if hits.len > 0 and size.len > 0:
      let hitCount = parseInt(hits)
      if project == "en" and hitCount > result.requests:
        result = Stats(
          projectName: project,
          pageTitle: title,
          requests: hitCount,
          contentSize: parseInt(size)
        )
|
||
proc readChunks(filename: string, chunksize = 1000000): Stats =
  ## Reads `filename` in blocks of roughly `chunksize` bytes, passes every
  ## block of complete lines to `parse` via `spawn`, and returns the entry
  ## with the highest `requests` value among the per-chunk results.
  ##
  ## Raises whatever `open` raises when the file cannot be opened.
  result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)
  var file = open(filename)
  # Release the handle on every exit path, including exceptions.
  defer: file.close()
  var responses = newSeq[FlowVar[Stats]]()
  # At least one byte so that the read loop always makes progress.
  var buffer = newString(max(chunksize, 1))
  var oldBufferLen = 0  # bytes of an unfinished line carried to the next read
  while not endOfFile(file):
    let readSize = file.readChars(buffer, oldBufferLen,
                                  buffer.len - oldBufferLen) + oldBufferLen
    var chunkLen = readSize

    # Find where the last complete line ends. `> 0` (not `>= 0`) keeps
    # `buffer[chunkLen - 1]` inside the string when no newline is present.
    while chunkLen > 0 and buffer[chunkLen - 1] notin NewLines:
      chunkLen.dec

    if chunkLen == 0 and readSize == buffer.len:
      # A full buffer without any newline: the line is longer than the
      # buffer, so enlarge it and keep reading instead of looping forever.
      oldBufferLen = readSize
      buffer.setLen(buffer.len * 2)
      continue

    if chunkLen > 0:
      responses.add(spawn parse(buffer[0 ..< chunkLen]))
    oldBufferLen = readSize - chunkLen
    # Move only the bytes that were actually read past the last newline to
    # the front; the buffer keeps its length.
    for i in 0 ..< oldBufferLen:
      buffer[i] = buffer[chunkLen + i]

  # A final line without a terminating newline is still pending here.
  if oldBufferLen > 0:
    responses.add(spawn parse(buffer[0 ..< oldBufferLen]))

  for resp in responses:
    let statistic = ^resp
    if statistic.requests > result.requests:
      result = statistic
|
||
|
||
when isMainModule:
  # Process the default dump and print the winning entry.
  let mostRequested = readChunks(filename)
  echo mostRequested
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
--threads:on |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# See this page for info about the format https://wikitech.wikimedia.org/wiki/Analytics/Data/Pagecounts-all-sites | ||
import tables, parseutils, strutils, threadpool, re | ||
|
||
# Name of the pagecounts dump that is processed when this module is run.
const
  filename = "pagecounts-20160101-050000"
|
||
type
  Stats = ref object
    # One pagecounts entry. The fields follow the columns of a line such as
    # `en Main_Page 242332 4737756101`.
    projectName: string
    pageTitle: string
    requests: int
    contentSize: int
|
||
proc `$`(stats: Stats): string =
  ## Renders all four fields of `stats` as a single parenthesised line.
  result = format(
    "(projectName: $#, pageTitle: $#, requests: $#, contentSize: $#)",
    stats.projectName,
    stats.pageTitle,
    $stats.requests,
    $stats.contentSize
  )
|
||
proc parse(chunk: string): Stats =
  ## Matches every line of `chunk` against a four-column pattern and returns
  ## the "en" entry whose request count is the highest seen. When no line
  ## qualifies the returned `Stats` keeps its initial empty values.
  # Each line looks like: en Main_Page 242332 4737756101
  result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)

  # Compiled once and reused for every line of the chunk.
  let pattern = re"([^\s]+)\s([^\s]+)\s(\d+)\s(\d+)"
  var groups: array[4, string]
  for row in splitLines(chunk):
    # `find` yields -1 for lines that do not match; those are skipped.
    if find(row, pattern, groups) != -1:
      let hitCount = parseInt(groups[2])
      if groups[0] == "en" and hitCount > result.requests:
        result = Stats(
          projectName: groups[0],
          pageTitle: groups[1],
          requests: hitCount,
          contentSize: parseInt(groups[3])
        )
|
||
proc readChunks(filename: string, chunksize = 1000000): Stats =
  ## Reads `filename` in blocks of roughly `chunksize` bytes, passes every
  ## block of complete lines to `parse` via `spawn`, prints how many tasks
  ## were spawned, and returns the entry with the highest `requests` value
  ## among the per-chunk results.
  ##
  ## Raises whatever `open` raises when the file cannot be opened.
  result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)
  var file = open(filename)
  # Release the handle on every exit path, including exceptions.
  defer: file.close()
  var responses = newSeq[FlowVar[Stats]]()
  # At least one byte so that the read loop always makes progress.
  var buffer = newString(max(chunksize, 1))
  var oldBufferLen = 0  # bytes of an unfinished line carried to the next read
  while not endOfFile(file):
    let readSize = file.readChars(buffer, oldBufferLen,
                                  buffer.len - oldBufferLen) + oldBufferLen
    var chunkLen = readSize

    # Find where the last complete line ends. `> 0` (not `>= 0`) keeps
    # `buffer[chunkLen - 1]` inside the string when no newline is present.
    while chunkLen > 0 and buffer[chunkLen - 1] notin NewLines:
      chunkLen.dec

    if chunkLen == 0 and readSize == buffer.len:
      # A full buffer without any newline: the line is longer than the
      # buffer, so enlarge it and keep reading instead of looping forever.
      oldBufferLen = readSize
      buffer.setLen(buffer.len * 2)
      continue

    if chunkLen > 0:
      responses.add(spawn parse(buffer[0 ..< chunkLen]))
    oldBufferLen = readSize - chunkLen
    # Move only the bytes that were actually read past the last newline to
    # the front; the buffer keeps its length.
    for i in 0 ..< oldBufferLen:
      buffer[i] = buffer[chunkLen + i]

  # A final line without a terminating newline is still pending here.
  if oldBufferLen > 0:
    responses.add(spawn parse(buffer[0 ..< oldBufferLen]))

  echo("Spawns: ", responses.len)
  for resp in responses:
    let statistic = ^resp
    if statistic.requests > result.requests:
      result = statistic
|
||
|
||
when isMainModule:
  # Process the default dump and print the winning entry.
  let mostRequested = readChunks(filename)
  echo mostRequested
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
--threads:on |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# See this page for info about the format https://wikitech.wikimedia.org/wiki/Analytics/Data/Pagecounts-all-sites | ||
import tables, parseutils, strutils | ||
|
||
# Name of the pagecounts dump that is processed when this module is run.
const
  filename = "pagecounts-20150101-050000"
|
||
proc parse(filename: string): tuple[projectName, pageTitle: string,
                                    requests, contentSize: int] =
  ## Reads `filename` line by line and returns the "en" entry with the
  ## highest request count. When no line qualifies the default-initialised
  ## tuple is returned.
  ##
  ## Raises whatever `open` raises when the file cannot be opened.
  # Each line looks like: en Main_Page 242332 4737756101
  var file = open(filename)
  # Close the handle even when reading or integer parsing raises.
  defer: file.close()
  for line in file.lines:
    var i = 0
    var projectName = ""
    i.inc parseUntil(line, projectName, Whitespace, i)
    i.inc  # step over the single separator character
    var pageTitle = ""
    i.inc parseUntil(line, pageTitle, Whitespace, i)
    i.inc
    var requests = 0
    i.inc parseInt(line, requests, i)
    i.inc
    var contentSize = 0
    i.inc parseInt(line, contentSize, i)
    if requests > result.requests and projectName == "en":
      result = (projectName, pageTitle, requests, contentSize)
|
||
when isMainModule:
  # Process the default dump and print the winning entry.
  let mostRequested = parse(filename)
  echo mostRequested
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import os, parseutils, threadpool, strutils | ||
|
||
type
  Stats = ref object
    # One parsed pagecounts entry; filled from the four space-separated
    # columns of a line by `parse`.
    domainCode: string
    pageTitle: string
    countViews: int
    totalSize: int
|
||
proc newStats(): Stats =
  ## Creates a `Stats` with empty strings and zero counters.
  new(result)
  result.domainCode = ""
  result.pageTitle = ""
  result.countViews = 0
  result.totalSize = 0
|
||
proc `$`(stats: Stats): string =
  ## Renders all four fields of `stats` as a single parenthesised line.
  result = format(
    "(domainCode: $#, pageTitle: $#, countViews: $#, totalSize: $#)",
    stats.domainCode,
    stats.pageTitle,
    $stats.countViews,
    $stats.totalSize
  )
|
||
proc parse(line: string, domainCode, pageTitle: var string,
           countViews, totalSize: var int) =
  ## Splits one space-separated `line` into its four columns and stores them
  ## in the output parameters. An empty `line` leaves all outputs untouched.
  if line.len > 0:
    const fieldSep = {' '}
    var pos = 0
    # Each step consumes one field plus the single separator after it.
    domainCode.setLen(0)
    pos += parseUntil(line, domainCode, fieldSep, pos) + 1
    pageTitle.setLen(0)
    pos += parseUntil(line, pageTitle, fieldSep, pos) + 1
    countViews = 0
    pos += parseInt(line, countViews, pos) + 1
    totalSize = 0
    discard parseInt(line, totalSize, pos)
|
||
proc parseChunk(chunk: string): Stats =
  ## Runs `parse` over every line of `chunk` and returns the "en" entry with
  ## the highest view count, or the value of `newStats()` when none qualifies.
  result = newStats()
  # Output slots handed to `parse` and reused for every line.
  var
    code = ""
    title = ""
    views = 0
    size = 0
  for row in chunk.splitLines():
    parse(row, code, title, views, size)
    if views > result.countViews and code == "en":
      result = Stats(
        domainCode: code,
        pageTitle: title,
        countViews: views,
        totalSize: size
      )
|
||
proc readPageCounts(filename: string, chunkSize = 1_000_000) =
  ## Reads `filename` in blocks of roughly `chunkSize` bytes, passes every
  ## block of complete lines to `parseChunk` via `spawn`, and prints the
  ## entry with the highest `countViews` among the per-chunk results.
  ##
  ## Raises whatever `open` raises when the file cannot be opened.
  var file = open(filename)
  # The original never closed the file; release it on every exit path.
  defer: file.close()
  var responses = newSeq[FlowVar[Stats]]()
  # At least one byte so that the read loop always makes progress.
  var buffer = newString(max(chunkSize, 1))
  var oldBufferLen = 0  # bytes of an unfinished line carried to the next read
  while not endOfFile(file):
    let reqSize = buffer.len - oldBufferLen
    let readSize = file.readChars(buffer, oldBufferLen, reqSize) + oldBufferLen
    var chunkLen = readSize

    # Find where the last complete line ends. `> 0` (not `>= 0`) keeps
    # `buffer[chunkLen - 1]` inside the string when no newline is present.
    while chunkLen > 0 and buffer[chunkLen - 1] notin NewLines:
      chunkLen.dec

    if chunkLen == 0 and readSize == buffer.len:
      # A full buffer without any newline: the line is longer than the
      # buffer, so enlarge it and keep reading instead of looping forever.
      oldBufferLen = readSize
      buffer.setLen(buffer.len * 2)
      continue

    if chunkLen > 0:
      responses.add(spawn parseChunk(buffer[0 ..< chunkLen]))
    oldBufferLen = readSize - chunkLen
    # Move only the bytes that were actually read past the last newline to
    # the front; the buffer keeps its length.
    for i in 0 ..< oldBufferLen:
      buffer[i] = buffer[chunkLen + i]

  # A final line without a terminating newline is still pending here.
  if oldBufferLen > 0:
    responses.add(spawn parseChunk(buffer[0 ..< oldBufferLen]))

  var mostPopular = newStats()
  for resp in responses:
    let statistic = ^resp
    if statistic.countViews > mostPopular.countViews:
      mostPopular = statistic

  echo("Most popular is: ", mostPopular)
|
||
when isMainModule:
  # The dump is looked up in the current working directory.
  const dumpName = "pagecounts-20160101-050000"
  readPageCounts(joinPath(getCurrentDir(), dumpName))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
import threadpool | ||
|
||
# Counter shared by both spawned tasks.
var counter = 0

proc increment(x: int) =
  ## Adds 1 to the global `counter`, `x` times.
  for i in 0 ..< x:
    # A separate read and write of `counter` lets two tasks overwrite each
    # other's update; an atomic increment makes every step count.
    discard atomicInc(counter)

spawn increment(10_000)
spawn increment(10_000)
sync()  # wait for both tasks before reading the total
echo(counter)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import os, parseutils | ||
|
||
proc parse(line: string, domainCode, pageTitle: var string,
           countViews, totalSize: var int) =
  ## Splits one space-separated `line` into its four columns and stores them
  ## in the output parameters, which are reset before being filled.
  const fieldSep = {' '}
  var pos = 0
  # Each step consumes one field plus the single separator after it.
  domainCode.setLen(0)
  pos += parseUntil(line, domainCode, fieldSep, pos) + 1
  pageTitle.setLen(0)
  pos += parseUntil(line, pageTitle, fieldSep, pos) + 1
  countViews = 0
  pos += parseInt(line, countViews, pos) + 1
  totalSize = 0
  discard parseInt(line, totalSize, pos)
|
||
proc readPageCounts(filename: string) =
  ## Parses `filename` line by line and prints the "en" entry with the
  ## highest view count as a tuple.
  # Output slots handed to `parse` and reused for every line.
  var
    domainCode = ""
    pageTitle = ""
    countViews = 0
    totalSize = 0
    mostPopular = ("", "", 0, 0)
  for line in lines(filename):
    parse(line, domainCode, pageTitle, countViews, totalSize)
    if countViews > mostPopular[2] and domainCode == "en":
      mostPopular = (domainCode, pageTitle, countViews, totalSize)

  echo("Most popular is: ", mostPopular)
|
||
when isMainModule:
  # The dump is looked up in the current working directory.
  const dumpName = "pagecounts-20160101-050000"
  readPageCounts(joinPath(getCurrentDir(), dumpName))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import threadpool, locks | ||
|
||
# Lock that must be held for every access to `counter`.
var counterLock: Lock
initLock(counterLock)
var counter {.guard: counterLock.} = 0

proc increment(x: int) =
  ## Adds 1 to the guarded global `counter`, `x` times.
  for i in 0 ..< x:
    # `counter` is declared with `guard: counterLock`, so it may only be
    # touched while that lock is held.
    withLock counterLock:
      let value = counter + 1
      counter = value

spawn increment(10_000)
spawn increment(10_000)
sync()  # wait for both tasks before reading the total
withLock counterLock:
  echo(counter)