input_chunk: reduce number of required Chunks, optimize performance (#…

…2025) When an input plugin is under pressure, meaning that it tries to ingest data and the engine realizes it is already over its `mem_buf_limit`, if the file system storage is enabled that data will be put 'down'. That approach worked but was not optimal for performance reasons. This patch makes sure to always look up new candidate chunks in reverse order and, in addition, it will only put down a Chunk that's under a memory limit scenario if its available capacity (available space) is less than 1%. === Performance Results === The following tests use the Syslog input plugin (TCP mode) and our stress tool to ingest 100.000 records (10 times), e.g.: flb-tcp-writer -c 1 -d syslog.log -r 100000 -p `pidof fluent-bit` -o 127.0.0.1:5150 == Before == records write (b) write secs | % cpu user (ms) sys (ms) Mem (bytes) Mem -------- ---------- -------- ----- + ------ --------- -------- ----------- ------- 100000 18524763 17.67M 1.19 | 31.85 340 40 33591296 32.04M 100000 18524763 17.67M 1.20 | 32.40 380 10 56164352 53.56M 100000 18524763 17.67M 1.26 | 32.54 390 20 75018240 71.54M 100000 18524763 17.67M 1.22 | 36.18 410 30 75284480 71.80M 100000 18524763 17.67M 1.23 | 39.77 480 10 74944512 71.47M 100000 18524763 17.67M 1.23 | 42.21 490 30 75100160 71.62M 100000 18524763 17.67M 1.25 | 45.52 540 30 75276288 71.79M 100000 18524763 17.67M 1.28 | 49.21 600 30 75440128 71.95M 100000 18524763 17.67M 1.28 | 50.00 630 10 75067392 71.59M 100000 18524763 17.67M 1.28 | 54.08 680 10 75313152 71.82M - Summary - Process : fluent-bit - PID : 28300 - Elapsed Time: 12.43 seconds - Avg Memory : 67.28M - Avg CPU : 41.38% - Avg Rate : 11.45M/sec - Storage layer total chunks : 349 |- mem chunks : 0 |- fs chunks : 349 |- up : 32 |- down : 317 == After == records write (b) write secs | % cpu user (ms) sys (ms) Mem (bytes) Mem -------- ---------- -------- ----- + ------ --------- -------- ----------- ------- 100000 18524763 17.67M 1.20 | 29.99 350 10 33251328 31.71M 100000 18524763 17.67M 1.28 | 32.82 400 20 
55824384 53.24M 100000 18524763 17.67M 1.26 | 32.54 370 40 76103680 72.58M 100000 18524763 17.67M 1.22 | 35.36 400 30 76365824 72.83M 100000 18524763 17.67M 1.23 | 36.64 420 30 76369920 72.83M 100000 18524763 17.67M 1.21 | 37.12 430 20 74670080 71.21M 100000 18524763 17.67M 1.22 | 37.71 440 20 74715136 71.25M 100000 18524763 17.67M 1.22 | 37.71 440 20 74911744 71.44M 100000 18524763 17.67M 1.22 | 39.21 450 30 75055104 71.58M 100000 18524763 17.67M 1.23 | 39.09 450 30 75120640 71.64M - Summary - Process : fluent-bit - PID : 29672 - Elapsed Time: 12.29 seconds - Avg Memory : 67.33M - Avg CPU : 35.82% - Avg Rate : 11.56M/sec - Storage layer total chunks : 111 |- mem chunks : 0 |- fs chunks : 111 |- up : 32 |- down : 79 === Conclusion === After the optimization, with a minimal load test, CPU time was reduced from 41.38% to 35.82%; data transfer and memory usage are almost the same. The other big difference, besides the CPU optimization, is the number of Chunks required to perform the same buffering: - Chunks needed before => 349 - Chunks needed after => 111 The results above are only shown to demonstrate the optimization; they must not be taken as a measure of overall performance, since different setups and loads might yield different results. In this test case Chunk Checksum was enabled. Signed-off-by: Eduardo Silva <[email protected]>
fluent · Mar 19, 2020 · 49b634f · 49b634f
1 parent 0d58955
commit 49b634f
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 4 deletions.
diff --git a/include/fluent-bit/flb_input_chunk.h b/include/fluent-bit/flb_input_chunk.h
@@ -26,7 +26,17 @@
 #include <monkey/mk_core.h>
 #include <msgpack.h>
 
-#define FLB_INPUT_CHUNK_SIZE 262144 /* 256KB (hint) */
+/*
+ * This variable defines a 'hint' size for new Chunks created, this
+ * value is passed to Chunk I/O.
+ */
+#define FLB_INPUT_CHUNK_SIZE 262144 /* 256KB (hint) */
+
+/*
+ * Defines a maximum size for a Chunk in the file system: note that although
+ * this is considered a limit, a Chunk's size might grow beyond it.
+ */
+#define FLB_INPUT_CHUNK_FS_MAX_SIZE 2048000 /* 2MB */
 
 struct flb_input_chunk {
  int busy; /* buffer is being flushed */

diff --git a/src/flb_input_chunk.c b/src/flb_input_chunk.c
@@ -197,7 +197,7 @@ static struct flb_input_chunk *input_chunk_get(const char *tag, int tag_len,
  struct flb_input_chunk *ic = NULL;
 
  /* Try to find a current chunk context to append the data */
- mk_list_foreach(head, &in->chunks) {
+ mk_list_foreach_r(head, &in->chunks) {
  ic = mk_list_entry(head, struct flb_input_chunk, _head);
  if (ic->busy == FLB_TRUE || cio_chunk_is_locked(ic->chunk)) {
  ic = NULL;
@@ -391,6 +391,7 @@ int flb_input_chunk_append_raw(struct flb_input_instance *in,
 {
  int ret;
  int set_down = FLB_FALSE;
+ int min;
  size_t size;
  struct flb_input_chunk *ic;
  struct flb_storage_input *si;
@@ -469,7 +470,7 @@ int flb_input_chunk_append_raw(struct flb_input_instance *in,
  size = cio_chunk_get_content_size(ic->chunk);
 
  /* Lock buffers where size > 2MB */
- if (size > 2048000) {
+ if (size > FLB_INPUT_CHUNK_FS_MAX_SIZE) {
  cio_chunk_lock(ic->chunk);
  }
 
@@ -525,7 +526,20 @@ int flb_input_chunk_append_raw(struct flb_input_instance *in,
  if (flb_input_chunk_is_overlimit(in) == FLB_TRUE &&
  si->type == CIO_STORE_FS) {
  if (cio_chunk_is_up(ic->chunk) == CIO_TRUE) {
- cio_chunk_down(ic->chunk);
+ /*
+ * If we are already over the limit, a subsequent data ingestion
+ * might need a Chunk to write data into. As an optimization we
+ * will put this Chunk down ONLY IF it has less than 1% of
+ * its capacity as available space, otherwise keep it 'up' so
+ * its available space can be used.
+ */
+ size = cio_chunk_get_content_size(ic->chunk);
+
+ /* Do we have less than 1% available ? */
+ min = (FLB_INPUT_CHUNK_FS_MAX_SIZE * 0.01);
+ if (FLB_INPUT_CHUNK_FS_MAX_SIZE - size < min) {
+ cio_chunk_down(ic->chunk);
+ }
  }
  return 0;
  }