From 4c42eea0d775293898e38cb1265ae2e905d263f6 Mon Sep 17 00:00:00 2001 From: ruflin Date: Thu, 24 May 2018 10:18:29 +0200 Subject: [PATCH] Initial commit: ECS Version 0.1.0 --- .gitignore | 3 + CHANGELOG.md | 13 + CONTRIBUTING.md | 73 +++ Makefile | 50 ++ README.md | 536 ++++++++++++++++++++ TODO.md | 8 + docs/about.md | 76 +++ docs/implementing.md | 35 ++ docs/intro.md | 23 + docs/use-cases-header.md | 4 + schema.csv | 140 ++++++ schemas/agent.yml | 40 ++ schemas/base.yml | 50 ++ schemas/cloud.yml | 49 ++ schemas/container.yml | 35 ++ schemas/destination.yml | 36 ++ schemas/device.yml | 50 ++ schemas/error.yml | 25 + schemas/event.yml | 119 +++++ schemas/file.yml | 83 ++++ schemas/geoip.yml | 30 ++ schemas/host.yml | 88 ++++ schemas/kubernetes.yml | 34 ++ schemas/log.yml | 23 + schemas/network.yml | 51 ++ schemas/organization.yml | 20 + schemas/process.yml | 42 ++ schemas/service.yml | 63 +++ schemas/source.yml | 38 ++ schemas/url.yml | 89 ++++ schemas/user.yml | 33 ++ schemas/user_agent.yml | 59 +++ scripts/helper.py | 139 ++++++ scripts/requirements.txt | 1 + scripts/schemas.py | 104 ++++ scripts/template.go | 64 +++ scripts/use-cases.py | 63 +++ template.json | 717 +++++++++++++++++++++++++++ use-cases/README.md | 11 + use-cases/apm.md | 21 + use-cases/apm.yml | 57 +++ use-cases/auditbeat.md | 28 ++ use-cases/auditbeat.yml | 147 ++++++ use-cases/beats.md | 18 + use-cases/beats.yml | 38 ++ use-cases/filebeat-apache-access.md | 23 + use-cases/filebeat-apache-access.yml | 113 +++++ use-cases/logging.md | 20 + use-cases/logging.yml | 69 +++ use-cases/metricbeat.md | 30 ++ use-cases/metricbeat.yml | 147 ++++++ 51 files changed, 3828 insertions(+) create mode 100644 .gitignore create mode 100644 CHANGELOG.md create mode 100644 CONTRIBUTING.md create mode 100644 Makefile create mode 100644 README.md create mode 100644 TODO.md create mode 100644 docs/about.md create mode 100644 docs/implementing.md create mode 100644 docs/intro.md create mode 100644 
docs/use-cases-header.md create mode 100644 schema.csv create mode 100644 schemas/agent.yml create mode 100644 schemas/base.yml create mode 100644 schemas/cloud.yml create mode 100644 schemas/container.yml create mode 100644 schemas/destination.yml create mode 100644 schemas/device.yml create mode 100644 schemas/error.yml create mode 100644 schemas/event.yml create mode 100644 schemas/file.yml create mode 100644 schemas/geoip.yml create mode 100644 schemas/host.yml create mode 100644 schemas/kubernetes.yml create mode 100644 schemas/log.yml create mode 100644 schemas/network.yml create mode 100644 schemas/organization.yml create mode 100644 schemas/process.yml create mode 100644 schemas/service.yml create mode 100644 schemas/source.yml create mode 100644 schemas/url.yml create mode 100644 schemas/user.yml create mode 100644 schemas/user_agent.yml create mode 100644 scripts/helper.py create mode 100644 scripts/requirements.txt create mode 100644 scripts/schemas.py create mode 100644 scripts/template.go create mode 100644 scripts/use-cases.py create mode 100644 template.json create mode 100644 use-cases/README.md create mode 100644 use-cases/apm.md create mode 100644 use-cases/apm.yml create mode 100644 use-cases/auditbeat.md create mode 100644 use-cases/auditbeat.yml create mode 100644 use-cases/beats.md create mode 100644 use-cases/beats.yml create mode 100644 use-cases/filebeat-apache-access.md create mode 100644 use-cases/filebeat-apache-access.yml create mode 100644 use-cases/logging.md create mode 100644 use-cases/logging.yml create mode 100644 use-cases/metricbeat.md create mode 100644 use-cases/metricbeat.yml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..99aaab979c --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.DS_Store +*.pyc +env diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000..6d50b4f4ea --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,13 @@ +# Change Log +All notable changes to this project will be 
documented in this file based on the [Keep a Changelog](http://keepachangelog.com/) Standard. This project adheres to [Semantic Versioning](http://semver.org/). + + +## [Unreleased](https://github.com/elastic/ecs/compare/0.1.0...master) + +### Breaking changes + +### Bugfixes + +### Added + +### Deprecated diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..56cdf24f92 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,73 @@ +# Contributing to Elastic Common Schema (ECS) + +All information related to ECS is versioned in this repository. All changes to ECS +happen through Git and Pull Requests. + + +## Requirements + +To contribute to ECS the following tools are expected to be running on your machine: + +* [Git](https://git-scm.com/) +* [Python](https://www.python.org/) +* [Go](https://golang.org/) + +## Steps to contribute + +To contribute changes to ECS follow the steps below: + +* Create [your own fork](https://help.github.com/articles/fork-a-repo/) of the ECS repo. +* Clone your fork to your machine. +* Run `git checkout -b branch-name` and replace `branch-name` according to your change +* Apply your changes to the `.yml` files as needed +* Run `make` to update generated files like `schema.csv` and `schema.md` +* Run `git commit -a -m "your message"` +* Run `git push --set-upstream origin branch-name` (assumes your clone remote is called `origin`) +* Create a [Pull Request](https://help.github.com/articles/creating-a-pull-request/) against the ECS repo. (if you go to the website, it should pop up as a button with your branch directly) +* Wait for reviews on your PR and collaborate. + +Notes: Make sure to always only push changes against your own fork. + + +## Fields.yml + +`fields.yml` files are used to describe the Elastic Common Schema in a structured way. These files allow to generate an Elasticsearch index template, Kibana index pattern or documentation output out of it in an automated way. 
+ +The structure of each document looks as follows: + +``` +- name: agent + title: Agent fields + group: 2 + description: > + The agent fields contain all the data about the agent/client/shipper that collected / generated the events. + + As an example in case of beats for logs this is `agent.name` is `filebeat`. + fields: + - name: version + type: keyword + description: > + Agent version. + example: 6.0.0-rc2 + phase: 0 +``` + +Each namespace has its own file to keep the files themselves small. Each namespace contains a list of fields which has all the fields inside. `title` and `description` are used to describe the namespace. `level` is purely for sorting purposes in the documentation output. + +Each field under `fields` has first the field `name`. The `type` is the [Elasticsearch field type](https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-types.html). `description` is used to add details about the field itself. With `example` an example value can be provided. The `phase` field is used to indicate in which `phase` the current field is (more about this below). In case `phase` is left out, it defaults to 0. + +## Phases + +The goal of the phase value for each field is to indicate if a field is already part of the standard or not. Different phases exist to make it easy to contribute new fields but still be able to iterate on top of it. The phases are defined as follows: + +* 0 (alpha): The field is new and is up for discussion if it should be added. The field might be removed at any time again. +* 1 (beta): It's clear that there is value of having the field in ECS and discussions about naming / namespaces etc. started. It's unlikely that the field is removed again but naming might change at any time. +* 2 (rc): The field has been accepted and is unlikely to change. It is now tested in the field. +* 3 (GA): The field is part of ECS and breaking changes to it happen only on major releases. 
+ +## Notes + +### Host vs Hostname + +* Host can contain hostname and port hostname:port +* Hostname never contains a port number diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000..f647868265 --- /dev/null +++ b/Makefile @@ -0,0 +1,50 @@ +generate: + python scripts/schemas.py + $(MAKE) readme + $(MAKE) template + +generate3: + python3 scripts/schemas.py + python3 scripts/use-cases.py + +fmt: + find . -name *.py -exec autopep8 --in-place --max-line-length 120 {} \; + +check: + # + # Validate that all generated changes are commited + $(MAKE) generate + git diff | cat + git update-index --refresh + git diff-index --exit-code HEAD -- + + # Check python code + find . -name *.py -exec autopep8 --in-place --max-line-length 120 {} \; | \ + (! grep . -q) || (echo "Code differs from autopep8's style" && false) + + # Basic spell checking + go get github.com/client9/misspell + misspell README.md CONTRIBUTING.md + +setup: + pip install -Ur ./scripts/requirements.txt + +setup3: + pip3 install -Ur ./scripts/requirements.txt + +clean: + rm schema.csv schema.md + # Clean all markdown files for use-cases + find ./use-cases -type f -name '*.md' -not -name 'README.md' -print0 | xargs -0 rm -- + +readme: + cat docs/intro.md > README.md + python scripts/schemas.py --stdout=true >> README.md + cat docs/use-cases-header.md >> README.md + python scripts/use-cases.py --stdout=true >> README.md + cat docs/implementing.md >> README.md + cat docs/about.md >> README.md + +template: + go get github.com/elastic/go-ucfg/yaml + go run scripts/template.go > ./template.json diff --git a/README.md b/README.md new file mode 100644 index 0000000000..8cce7d8f74 --- /dev/null +++ b/README.md @@ -0,0 +1,536 @@ +**WARNING: THIS IS WORK IN PROGRESS** + +# Elastic Common Schema (ECS) + +The Elastic Common Schema (ECS) is used to provide a common data model when +ingesting data into Elasticsearch. 
Having a common schema allows you to correlate +data from sources like logs and metrics or IT operations +analytics and security analytics. + +ECS is still under development and backward compatibility is not guaranteed. Any +feedback on the general structure, missing fields, or existing fields is appreciated. +For contributions please read the [Contributing Guide](CONTRIBUTING.md). + +The current version of ECS is `0.1.0`. + +* [Fields](#fields) +* [Use cases](#use-cases) +* [Implementing ECS](#implementing-ecs) +* [About ECS](#about-ecs) + +# Fields + +List of available ECS fields. + * [Base fields](#base) + * [Agent fields](#agent) + * [Cloud fields](#cloud) + * [Container fields](#container) + * [Destination fields](#destination) + * [Device fields](#device) + * [Error fields](#error) + * [Event fields](#event) + * [File fields](#file) + * [Geoip fields](#geoip) + * [Host fields](#host) + * [Kubernetes fields](#kubernetes) + * [Log fields](#log) + * [Network fields](#network) + * [Organization fields](#organization) + * [Process fields](#process) + * [Service fields](#service) + * [Source fields](#source) + * [URL fields](#url) + * [User fields](#user) + * [User agent fields](#user_agent) + +## Base fields + +The base set contains all fields which are on the top level without a namespace. + +These are fields which are common across all types of events. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `@timestamp` | Timestamp when the event was created.

For log events this is expected to be when the event was generated and not when it was read.
Timestamp is a required field and must exist in all events. | date | | `2016-05-23T08:05:34.853Z` | +| `tags` | Tags is a list of keywords which are used to tag each event. | keyword | | `["production", "env2"]` | +| `labels` | Labels is an object which contains key/value pairs.
Labels can be used to add additional meta information to events. Label should not contain nested objects and all values are stored as keyword.
An example usage is the docker and k8s labels. | object | | `{key1: value1, key2: value2}` | +| `message` | For log events the message field contains the log message.
In other use cases the message field can be used to concatenate together different values which are then freely searchable. Or if multiple messages exist they can be combined here into one message. | text | | `Hello World` | + + +## Agent fields + +The agent fields contains the data about the agent/client/shipper that created the event. + +As an example in case of Beats for logs the `agent.name` is `filebeat`. In the case of APM it is the agent running in the app / service. The agent information does not change if data is sent through queuing system like Kafka, Redis, or processing systems like Logstash or APM Server. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `agent.version` | Agent version. | keyword | | `6.0.0-rc2` | +| `agent.name` | Agent name.
Name of the agent. | keyword | | `filebeat` | +| `agent.id` | Unique identifier of this agent if one exists.
In the case of Beats this would be beat.id. | keyword | | `8a4f500d` | +| `agent.ephemeral_id` | Ephemeral identifier of this agent if one exists.
This id compared to id normally changes across restarts. | keyword | | `8a4f500f` | + + +## Cloud fields + +All fields related to the cloud or infrastructure the events are coming from. + +In case Metricbeat is running on an EC2 host and fetches data from its host, the cloud info is expected to contain the data about this machine. In the case Metricbeat runs outside the cloud on a remote machine and fetches data from a service running in the cloud it is expected to have the cloud data from the machine on which the service is running in. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `cloud.provider` | Name of the cloud provider. Example values are ec2, gce, or digitalocean. | keyword | | `ec2` | +| `cloud.availability_zone` | Availability zone in which this host is running. | keyword | | `us-east-1c` | +| `cloud.region` | Region in which this host is running. | keyword | | `us-east-1` | +| `cloud.instance.id` | Instance ID of the host machine. | keyword | | `i-1234567890abcdef0` | +| `cloud.instance.name` | Instance name of the host machine. | keyword | | | +| `cloud.machine.type` | Machine type of the host machine. | keyword | | `t2.medium` | + + +## Container fields + +Container fields are used for meta information about the specific container the information is coming from. This should help to correlate data based containers from any runtime. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `container.runtime` | Runtime managing this container. | keyword | | `docker` | +| `container.id` | Unique container id. | keyword | | | +| `container.image.name` | Name of the image the container was built on. | keyword | | | +| `container.image.tag` | Container image tag. | keyword | | | +| `container.name` | Container name. | keyword | | | +| `container.labels` | Image labels. | object | | | + + +## Destination fields + +Destination fields describe details about the destination of a packet/event. 
+ + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `destination.ip` | IP address of the destination.
This can be one or multiple IPv4 or IPv6 addresses. | ip | | | +| `destination.hostname` | Hostname of the destination. | keyword | | | +| `destination.port` | Port of the destination. | long | | | +| `destination.mac` | MAC address of the destination. | keyword | | | +| `destination.domain` | Destination domain. | keyword | | | +| `destination.subdomain` | Destination subdomain. | keyword | | | + + +## Device fields + +Device fields are used to give additional information about the device that the information is coming from. + +This could be a firewall, network device, etc. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `device.mac` | MAC address of the device | keyword | | | +| `device.ip` | IP address of the device. | ip | | | +| `device.hostname` | Hostname of the device. | keyword | | | +| `device.vendor` | Device vendor information. | text | | | +| `device.version` | Device version. | keyword | | | +| `device.serial_number` | Device serial number. | keyword | | | +| `device.timezone.offset.sec` | Timezone offset of the host in seconds.

Number of seconds relative to UTC. In case the offset is -01:30 the value will be -5400. | long | | `-5400` | +| `device.type` | The type of the device the data is coming from.
There is no predefined list of device types. Some examples are `endpoint`, `firewall`, `ids`, `ips`, `proxy`. | keyword | | `firewall` | + + +## Error fields + +Error namespace + +This can be used to represent all kinds of errors. It can be for errors that happen while fetching events or if the event itself contains an error. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `error.id` | Unique identifier for the error. | keyword | | | +| `error.message` | Error message. | text | | | +| `error.code` | Error code describing the error. | keyword | | | + + +## Event fields + +The event fields are used for context information about the data itself. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `event.id` | Unique ID to describe the event. | keyword | | `8a4f500d` | +| `event.category` | Event category.
This can be a user defined category. | keyword | | `metrics` | +| `event.type` | A type given to this kind of event which can be used for grouping.
This is normally defined by the user. | keyword | | `nginx-stats-metrics` | +| `event.module` | Name of the module this data is coming from.
This information is coming from the modules used in Beats or Logstash. | keyword | | `mysql` | +| `event.dataset` | Name of the dataset.
The concept of a `dataset` (fileset / metricset) is used in Beats as a subset of modules. It contains the information which is currently stored in metricset.name and metricset.module or fileset.name. | keyword | | `stats` | +| `event.severity` | Severity describes the severity of the event. What the different severity values mean can vary between use cases. It's up to the implementer to make sure severities are consistent across events. | long | | `7` | +| `event.raw` | Raw text message of entire event to be used to demonstrate log integrity. | keyword | | `Sep 19 08:26:10 host CEF:0|Security| threatmanager|1.0|100| worm successfully stopped|10|src=10.0.0.1 dst=2.1.2.2spt=1232` | +| `event.hash` | Hash (perhaps logstash fingerprint) of raw field to be able to demonstrate log integrity. | keyword | | `123456789012345678901234567890ABCD` | +| `event.version` | The version field contains the version an event for ECS adheres to.

This field should be provided as part of each event to make it possible to detect to which ECS version an event belongs.
event.version is a required field and must exist in all events. It describes which ECS version the event adheres to.
The current version is 0.1.0. | keyword | | `0.1.0` | +| `event.duration` | Duration of the event in nanoseconds. | long | | | +| `event.created` | event.created contains the date when the event was created.
This timestamp is distinct from @timestamp in that @timestamp contains the processed timestamp. For logs these two timestamps can be different as the timestamp in the log line and when the event is read for example by Filebeat are not identical. `@timestamp` must contain the timestamp extracted from the log line, event.created when the log line is read. The same could apply to packet capturing where @timestamp contains the timestamp extracted from the network packet and event.created when the event was created.

In case the two timestamps are identical, @timestamp should be used. | date | | | +| `event.risk_score` | Risk score value of the event. | float | | | + + +## File fields + +File attributes. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `file.path` | The path to the file. | text | | | +| `file.path.raw` | The path to the file. This is a non-analyzed field that is useful for aggregations. | keyword | 1 | | +| `file.target_path` | The target path for symlinks. | text | | | +| `file.target_path.raw` | The path to the file. This is a non-analyzed field that is useful for aggregations. | keyword | 1 | | +| `file.extension` | The file extension.
This should allow easy filtering by file extensions. | keyword | | `png` | +| `file.type` | The file type (file, dir, or symlink). | keyword | | | +| `file.device` | The device. | keyword | | | +| `file.inode` | The inode representing the file in the filesystem. | keyword | | | +| `file.uid` | The user ID (UID) or security identifier (SID) of the file owner. | keyword | | | +| `file.owner` | The file owner's username. | keyword | | | +| `file.gid` | The primary group ID (GID) of the file. | keyword | | | +| `file.group` | The primary group name of the file. | keyword | | | +| `file.mode` | The mode of the file in octal representation. | keyword | | `416` | +| `file.size` | The file size in bytes (field is only added when `type` is `file`). | long | | | +| `file.mtime` | The last modified time of the file (time when content was modified). | date | | | +| `file.ctime` | The last change time of the file (time when metadata was changed). | date | | | + + +## Geoip fields + +Geoip fields are used for geo information for an ip address. + +The conversion to geoip information can be done by the Elasticsearch geoip plugin. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `geoip.continent_name` | The name of the continent. | keyword | | | +| `geoip.country_iso_code` | Country ISO code. | keyword | | | +| `geoip.location` | The longitude and latitude. | geo_point | | | +| `geoip.region_name` | The region name. | keyword | | | +| `geoip.city_name` | The city name. | keyword | | | + + +## Host fields + +All fields related to a host. A host can be a physical machine, a virtual machine, and also a Docker container. + +Normally the host information is related to the machine on which the event was generated / collected but also can be used differently if needed. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `host.timezone.offset.sec` | Timezone offset of the host in seconds.

Number of seconds relative to UTC. In case the offset is -01:30 the value will be -5400. | long | | `-5400` | +| `host.name` | host.name is the hostname of the host.
It can contain what `hostname` returns on Unix systems, the fully qualified domain name or also a name specified by the user. It is up to the sender to decide which value to use. | keyword | | | +| `host.id` | Unique host id.
As hostname is not always unique, this often can be configured by the user. An example here is the current usage of `beat.name`. | keyword | | | +| `host.ip` | Host ip address. | ip | | | +| `host.mac` | Host mac address. | keyword | | | +| `host.type` | This is the type of the host.
For Cloud providers this can be the machine type like `t2.medium`. It can also be vm or container, for example, or something user defined. | keyword | | | +| `host.os.platform` | Operating system platform (e.g. centos, ubuntu, windows). | keyword | | `darwin` | +| `host.os.name` | Operating system name. | keyword | | `Mac OS X` | +| `host.os.family` | OS family (e.g. redhat, debian, freebsd, windows). | keyword | | `debian` | +| `host.os.version` | Operating system version. | keyword | | `10.12.6` | +| `host.architecture` | Operating system architecture. | keyword | | `x86_64` | + + +## Kubernetes fields + +Kubernetes fields are used for meta information about k8s. This should help to correlate data coming out of k8s setups. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `kubernetes.pod.name` | Kubernetes pod name | keyword | | | +| `kubernetes.namespace` | Kubernetes namespace | keyword | | | +| `kubernetes.labels` | Kubernetes labels map | object | | | +| `kubernetes.annotations` | Kubernetes annotations map | object | | | +| `kubernetes.container.name` | Kubernetes container name. This name is unique within the pod only, it's different from underlying container name (container.name in ECS) | keyword | | | + + +## Log fields + +Fields which are specific to log events. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `log.level` | Log level of the log event.

Some examples are `WARN`, `ERR`, `INFO`. | keyword | | `ERR` | +| `log.line` | Line number the log event was collected from. | long | | `18` | +| `log.offset` | Offset of the beginning of the log event. | long | | `12` | + + +## Network fields + +All fields related to network data. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `network.protocol` | Network protocol name. | keyword | | `http` | +| `network.direction` | Direction of the network traffic.
The recommended values are:
* inbound
* outbound
* unknown | keyword | | `inbound` | +| `network.forwarded_ip` | forwarded_ip indicates the host IP address when the source IP address is the proxy. | ip | | `192.1.1.2` | +| `network.inbound.bytes` | Network inbound bytes. | long | | `184` | +| `network.inbound.packets` | Network inbound packets. | long | | `12` | +| `network.outbound.bytes` | Network outbound bytes. | long | | `184` | +| `network.outbound.packets` | Network outbound packets. | long | | `12` | + + +## Organization fields + +The organization namespace can be used to enrich data with information from which organization the data belongs. + +This can be useful if data stored in the same index should sometimes be filtered or organized by one or multiple organizations. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `organization.name` | Organization name. | text | | | +| `organization.id` | Unique identifier for the organization. | keyword | | | + + +## Process fields + +These fields contain information about a process. + +If metrics information is collected for a process and a process id / name shows up in a log message, these fields should help to correlate the two. It is expected that the `process.pid` will often also stay in the metric itself and is only copied to the global field for correlation. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `process.args` | Process arguments.

May be filtered to protect sensitive information. | keyword | | `['-l', 'user', '10.0.0.16']` | +| `process.name` | Process name.
This is sometimes also known as program name or similar. | keyword | | `ssh` | +| `process.pid` | Process id. | long | | | +| `process.ppid` | Process parent id. | long | | | +| `process.title` | Process title.
The proctitle, often the same as process name. | keyword | | | + + +## Service fields + +The service fields describe the service for / from which the data was collected. + +If logs or metrics are collected from Redis, `service.name` would be `redis`. This allows to find and correlate logs for a specific service and even version with `service.version`. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `service.id` | Unique identifier of the running service.
This id should uniquely identify this service. This makes it possible to correlate logs and metrics for one specific service. For example in case of issues with one redis instance, it's possible to filter on the id to see metrics and logs for this single instance. | keyword | | `d37e5ebfe0ae6c4972dbe9f0174a1637bb8247f6` | +| `service.name` | Name of the service data is collected from.
The name can be used to group logs and metrics together from one service and correlate them. | keyword | | `elasticsearch` | +| `service.type` | Service type. | keyword | | | +| `service.state` | Current state of the service. | keyword | | | +| `service.version` | Version of the service the data was collected from.
This allows to look at a data set only for a specific version of a service. | keyword | | `3.2.4` | +| `service.ephemeral_id` | Ephemeral identifier of this service if one exists.
This id compared to id normally changes across restarts. | keyword | | `8a4f500f` | + + +## Source fields + +Source fields describe details about the source of where the event is coming from. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `source.ip` | IP address of the source.
This can be one or multiple IPv4 or IPv6 addresses. | ip | | | +| `source.hostname` | Hostname of the source. | keyword | | | +| `source.port` | Port of the source. | long | | | +| `source.mac` | MAC address of the source. | keyword | | | +| `source.domain` | Source domain. | keyword | | | +| `source.subdomain` | Source subdomain. | keyword | | | + + +## URL fields + +A complete URL, with scheme, host, and path. + +The URL object can be reused in other prefixes like `host.url.*` for example. It is important that whenever URL is used that the same structure is used. + +`url.href` is a [multi field](https://www.elastic.co/guide/en/elasticsearch/reference/6.2/multi-fields.html#_multi_fields_with_multiple_analyzers) which means the data is stored as keyword `url.href` and text `url.href.analyzed`. The advantage of this is that running a query against only a part of the URL still works without having to split up the URL into all its parts at ingest time. + +Based on whatwg URL definition: https://github.com/whatwg/url/issues/337 + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `url.href` | href contains the full url. The field is stored as keyword.

`href` is an analyzed field so the parsed information can be accessed through `href.analyzed` in queries. | keyword | | `https://elastic.co:443/search?q=elasticsearch#top` | +| `url.href.analyzed` | | text | 1 | | +| `url.protocol` | The protocol of the request, e.g. "https:". | keyword | | | +| `url.hostname` | The hostname of the request, e.g. "example.com".
For correlation this field can be copied into the `host.name` field. | keyword | | | +| `url.port` | The port of the request, e.g. 443. | keyword | | | +| `url.pathname` | The path of the request, e.g. "/search". | text | | | +| `url.pathname.raw` | The url path. This is a non-analyzed field that is useful for aggregations. | keyword | 1 | | +| `url.search` | The search describes the query string of the request, e.g. "q=elasticsearch". | text | | | +| `url.search.raw` | The url search part. This is a non-analyzed field that is useful for aggregations. | keyword | 1 | | +| `url.hash` | The hash of the request URL, e.g. "top". | keyword | | | +| `url.username` | The username of the request. | keyword | | | +| `url.password` | The password of the request. | keyword | | | +| `url.extension` | The url extension field contains the extension of the file associated with the url.

A simple example is `http://localhost/logo.png` where the extension would be `png`. There can also be more complex cases like `http://localhost/content?asset=logo.png&token=XYZ` where the extension could also be `png` but depends on the implementation.
The `extension` field should be left out if the extension is not defined. | keyword | | `png` | + + +## User fields + +The user fields are used to describe user information as part of the event. + +All fields in user can have one or multiple entries. If a user has more than one id, an array with the ids must be provided. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `user.id` | One or multiple unique identifiers of the user. | keyword | | | +| `user.name` | Name of the user.

As the field is a keyword, the field will not be tokenized. | keyword | | | +| `user.email` | User email address. | keyword | | | +| `user.hash` | Unique user hash to correlate information for a user in anonymized form.
This is useful in case `user.id` or `user.name` cannot be used because it contains confidential information. | keyword | | | + + +## User agent fields + +The user_agent fields are normally coming from a browser request. + +These are common to show up in web service logs coming from the parsed user agent string. + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| `user_agent.raw` | Unparsed version of the user_agent. | text | | | +| `user_agent.device` | The name of the physical device. | keyword | | | +| `user_agent.version` | Version of the physical device. | keyword | | | +| `user_agent.major` | The major version of the user agent. | long | | | +| `user_agent.minor` | The minor version of the user agent. | long | | | +| `user_agent.patch` | The patch version of the user agent. | keyword | | | +| `user_agent.name` | The name of the user agent. | keyword | | `Chrome` | +| `user_agent.os.name` | The name of the operating system. | keyword | | | +| `user_agent.os.version` | Version of the operating system. | keyword | | | +| `user_agent.os.major` | The major version of the operating system. | long | | | +| `user_agent.os.minor` | The minor version of the operating system. | long | | | +| `user_agent.os.name` | The name of the operating system. | keyword | | | + + + + + +# Use cases + +Below are some examples that demonstrate how ECS fields can be applied to +specific use cases. 
+ + * [Logging](https://github.com/elastic/ecs/blob/master/use-cases/logging.md) + * [APM](https://github.com/elastic/ecs/blob/master/use-cases/apm.md) + * [Filebeat Apache](https://github.com/elastic/ecs/blob/master/use-cases/filebeat-apache-access.md) + * [Beats](https://github.com/elastic/ecs/blob/master/use-cases/beats.md) + * [Auditbeat](https://github.com/elastic/ecs/blob/master/use-cases/auditbeat.md) + * [Metricbeat](https://github.com/elastic/ecs/blob/master/use-cases/metricbeat.md) + + + +# Implementing ECS + +## Adhere to ECS + +The following rules apply if an event wants to adhere to ECS: + +* The document MUST have the `@timestamp` field. +* The [data type](https://www.elastic.co/guide/en/elasticsearch/reference/6.2/mapping-types.html) defined for an ECS field MUST be used. +* It SHOULD have the field `event.version` to define which version of ECS it uses. + +To make the most out of ECS as many fields as possible should be mapped to ECS. + +## Rules + +ECS follows the following writing and naming rules for the fields. The goal of +these rules is to make the fields easy to remember and have a guide when new +fields are added. + +Often events will contain additional fields besides ECS. These can follow the +same naming and writing rules but don't have to. + +**Writing** + +* All fields must be lower case +* No special characters except `_` +* Words are combined through underscore + +**Naming** + +* Use present tense unless field describes historical information. +* Use singular and plural names properly to reflect the field content. For example, use `requests_per_sec` rather than `request_per_sec`. +* Organise the prefixes from general to specific to allow grouping fields into objects with a prefix like `host.*`. +* Avoid stuttering of words. If part of the field name is already in the prefix, do not repeat it. Example: `host.host_ip` should be `host.ip`. +* Fields must be prefixed except for the base fields. 
For example all `host` fields are prefixed with `host.`. See `dot` notation in FAQ for more details. +* Do not use abbreviations (few exceptions like `ip` exist) + +# About ECS + +## Scope + +The Elastic Common Schema defines a common set of document fields (and their respective field names) to be used in event messages stored in Elasticsearch as part of any logging or metrics use case of the Elastic Stack, including IT operations analytics and security analytics. + +## Goals + +The ECS has the following goals: + +* Correlate data between metrics, logs and APM +* Correlate data coming from the same machines / hosts +* Correlate data coming from the same service + +Priority on which fields are added is based on these goals. + + +## Benefits + +The benefits to a user adopting these fields and names in their clusters are: + +- Ability to simply correlate data from different data sources +- Improved ability to remember commonly used field names (since there is only a single set, not a set per data source) +- Improved ability to deduce unremembered field names (since the field naming follows a small number of rules with few exceptions) +- Ability to re-use analysis content (searches, visualizations, dashboards, alerts, reports, and ML jobs) across multiple data sources +- Ability to use any future Elastic-provided analysis content in their environment without modifications + + +## FAQ + +### Why is ECS using a dot notation instead of an underline notation? + +There are two common formats on how keys are formatted when ingesting data into Elasticsearch: + +* Dot notation: `user.firstname: Nicolas`, `user.lastname: Ruflin` +* Underline notation: `user_firstname: Nicolas`, `user_lastname: Ruflin` + +In ECS the decision was made to use the dot notation and this entry is intended to share some background on this decision. 
+ +**What is the difference between the two notations?** + +When ingesting `user.firstname` and `user.lastname` it is identical to ingesting the following JSON: + +``` +"user": { + "firstname": "Nicolas", + "lastname": "Ruflin" +} +``` + +This means internally in Elasticsearch `user` is represented as an [object datatype](https://www.elastic.co/guide/en/elasticsearch/reference/6.2/object.html). In the case of the underline notation both are just [string datatypes](https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-types.html). + +NOTE: ECS does not use [nested datatypes](https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html), which are arrays of objects. + +**Advantages of dot notation** + +The advantage of the dot notation is that on the Elasticsearch side each prefix is an object. Each object can have [parameters](https://www.elastic.co/guide/en/elasticsearch/reference/current/object.html#object-params) on how fields inside the object should be treated, for example if they should be indexed or mappings should be extended. In the context of ECS this allows for example to disable dynamic property creation for certain prefixes. + +On the ingest side of Elasticsearch it makes it simpler to for example drop complete objects with the remove processor instead of selecting each key inside it. It does not require prior knowledge which keys will end up in the object. + +On the event producing side like in Beats it simplifies the creation of the events as on the code side each object can be treated as an object (or struct in Golang as an example) which makes constructing and modifying each part of the final event easier. + +**Disadvantage of dot notation** + +In Elasticsearch each key can only have one type. So if `user` is an object it's not possible to have in the same index `user` as type `keyword` like `{"user": "nicolas ruflin"}`. This can be an issue in certain datasets. 
+ +For the ECS data itself this is not an issue as all fields are predefined. + +**What if I already use the underline notation?** + +It's not a problem to mix the underline notation with the ECS dot notation. They can coexist in the same document as long as there are no conflicts. + +**I have conflicting fields with ECS?** + +Assuming you already have a field `user` but ECS uses `user` as an object, you can use the [rename processor](https://www.elastic.co/guide/en/elasticsearch/reference/6.2/rename-processor.html) at ingest time to rename your field to either the matching ECS field or rename it to `user.value` instead if your field does not match ECS. diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000000..1d654610b2 --- /dev/null +++ b/TODO.md @@ -0,0 +1,8 @@ +TODO + +* Decide on schema name (Elastic Common Schema) +* Document verified fields +* How can we link solutions and format + * Format should not know about solutions, but link it back + * Create solutions pages? +* Goal of Phases: Make it easy to contribute new ideas and then iterate on top of it diff --git a/docs/about.md b/docs/about.md new file mode 100644 index 0000000000..d2dd8c56e7 --- /dev/null +++ b/docs/about.md @@ -0,0 +1,76 @@ + +# About ECS + +## Scope + +The Elastic Common Schema defines a common set of document fields (and their respective field names) to be used in event messages stored in Elasticsearch as part of any logging or metrics use case of the Elastic Stack, including IT operations analytics and security analytics. + +## Goals + +The ECS has the following goals: + +* Correlate data between metrics, logs and APM +* Correlate data coming from the same machines / hosts +* Correlate data coming from the same service + +Priority on which fields are added is based on these goals. 
+ + +## Benefits + +The benefits to a user adopting these fields and names in their clusters are: + +- Ability to simply correlate data from different data sources +- Improved ability to remember commonly used field names (since there is only a single set, not a set per data source) +- Improved ability to deduce unremembered field names (since the field naming follows a small number of rules with few exceptions) +- Ability to re-use analysis content (searches, visualizations, dashboards, alerts, reports, and ML jobs) across multiple data sources +- Ability to use any future Elastic-provided analysis content in their environment without modifications + + +## FAQ + +### Why is ECS using a dot notation instead of an underline notation? + +There are two common formats on how keys are formatted when ingesting data into Elasticsearch: + +* Dot notation: `user.firstname: Nicolas`, `user.lastname: Ruflin` +* Underline notation: `user_firstname: Nicolas`, `user_lastname: Ruflin` + +In ECS the decision was made to use the dot notation and this entry is intended to share some background on this decision. + +**What is the difference between the two notations?** + +When ingesting `user.firstname` and `user.lastname` it is identical to ingesting the following JSON: + +``` +"user": { + "firstname": "Nicolas", + "lastname": "Ruflin" +} +``` + +This means internally in Elasticsearch `user` is represented as an [object datatype](https://www.elastic.co/guide/en/elasticsearch/reference/6.2/object.html). In the case of the underline notation both are just [string datatypes](https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-types.html). + +NOTE: ECS does not use [nested datatypes](https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html), which are arrays of objects. + +**Advantages of dot notation** + +The advantage of the dot notation is that on the Elasticsearch side each prefix is an object. 
Each object can have [parameters](https://www.elastic.co/guide/en/elasticsearch/reference/current/object.html#object-params) on how fields inside the object should be treated, for example if they should be indexed or mappings should be extended. In the context of ECS this allows for example to disable dynamic property creation for certain prefixes. + +On the ingest side of Elasticsearch it makes it simpler to for example drop complete objects with the remove processor instead of selecting each key inside it. It does not require prior knowledge which keys will end up in the object. + +On the event producing side like in Beats it simplifies the creation of the events as on the code side each object can be treated as an object (or struct in Golang as an example) which makes constructing and modifying each part of the final event easier. + +**Disadvantage of dot notation** + +In Elasticsearch each key can only have one type. So if `user` is an object it's not possible to have in the same index `user` as type `keyword` like `{"user": "nicolas ruflin"}`. This can be an issue in certain datasets. + +For the ECS data itself this is not an issue as all fields are predefined. + +**What if I already use the underline notation?** + +It's not a problem to mix the underline notation with the ECS dot notation. They can coexist in the same document as long as there are no conflicts. + +**I have conflicting fields with ECS?** + +Assuming you already have a field `user` but ECS uses `user` as an object, you can use the [rename processor](https://www.elastic.co/guide/en/elasticsearch/reference/6.2/rename-processor.html) at ingest time to rename your field to either the matching ECS field or rename it to `user.value` instead if your field does not match ECS. 
diff --git a/docs/implementing.md b/docs/implementing.md new file mode 100644 index 0000000000..ed5f55f148 --- /dev/null +++ b/docs/implementing.md @@ -0,0 +1,35 @@ +# Implementing ECS + +## Adhere to ECS + +The following rules apply if an event wants to adhere to ECS: + +* The document MUST have the `@timestamp` field. +* The [data type](https://www.elastic.co/guide/en/elasticsearch/reference/6.2/mapping-types.html) defined for an ECS field MUST be used. +* It SHOULD have the field `event.version` to define which version of ECS it uses. + +To make the most out of ECS as many fields as possible should be mapped to ECS. + +## Rules + +ECS follows the following writing and naming rules for the fields. The goal of +these rules is to make the fields easy to remember and have a guide when new +fields are added. + +Often events will contain additional fields besides ECS. These can follow the +same naming and writing rules but don't have to. + +**Writing** + +* All fields must be lower case +* No special characters except `_` +* Words are combined through underscore + +**Naming** + +* Use present tense unless field describes historical information. +* Use singular and plural names properly to reflect the field content. For example, use `requests_per_sec` rather than `request_per_sec`. +* Organise the prefixes from general to specific to allow grouping fields into objects with a prefix like `host.*`. +* Avoid stuttering of words. If part of the field name is already in the prefix, do not repeat it. Example: `host.host_ip` should be `host.ip`. +* Fields must be prefixed except for the base fields. For example all `host` fields are prefixed with `host.`. See `dot` notation in FAQ for more details. 
+* Do not use abbreviations (few exceptions like `ip` exist) diff --git a/docs/intro.md b/docs/intro.md new file mode 100644 index 0000000000..cdb705b535 --- /dev/null +++ b/docs/intro.md @@ -0,0 +1,23 @@ +**WARNING: THIS IS WORK IN PROGRESS** + +# Elastic Common Schema (ECS) + +The Elastic Common Schema (ECS) is used to provide a common data model when +ingesting data into Elasticsearch. Having a common schema allows you to correlate +data from sources like logs and metrics or IT operations +analytics and security analytics. + +ECS is still under development and backward compatibility is not guaranteed. Any +feedback on the general structure, missing fields, or existing fields is appreciated. +For contributions please read the [Contributing Guide](CONTRIBUTING.md). + +The current version of ECS is `0.1.0`. + +* [Fields](#fields) +* [Use cases](#use-cases) +* [Implementing ECS](#implementing-ecs) +* [About ECS](#about-ecs) + +# Fields + +List of available ECS fields. diff --git a/docs/use-cases-header.md b/docs/use-cases-header.md new file mode 100644 index 0000000000..52f82bf6c9 --- /dev/null +++ b/docs/use-cases-header.md @@ -0,0 +1,4 @@ +# Use cases + +Below are some examples that demonstrate how ECS fields can be applied to +specific use cases. 
diff --git a/schema.csv b/schema.csv new file mode 100644 index 0000000000..4e74119163 --- /dev/null +++ b/schema.csv @@ -0,0 +1,140 @@ +Field,Type,Phase,Example +@timestamp,date,3,2016-05-23T08:05:34.853Z +labels,object,0,"{key1: value1, key2: value2}" +message,text,1,Hello World +tags,keyword,0,"[""production"", ""env2""]" +agent.ephemeral_id,keyword,0,8a4f500f +agent.id,keyword,0,8a4f500d +agent.name,keyword,0,filebeat +agent.version,keyword,0,6.0.0-rc2 +cloud.availability_zone,keyword,0,us-east-1c +cloud.instance.id,keyword,0,i-1234567890abcdef0 +cloud.instance.name,keyword,0, +cloud.machine.type,keyword,0,t2.medium +cloud.provider,keyword,0,ec2 +cloud.region,keyword,0,us-east-1 +container.id,keyword,0, +container.image.name,keyword,0, +container.image.tag,keyword,0, +container.labels,object,0, +container.name,keyword,0, +container.runtime,keyword,0,docker +destination.domain,keyword,1, +destination.hostname,keyword,0, +destination.ip,ip,0, +destination.mac,keyword,0, +destination.port,long,0, +destination.subdomain,keyword,1, +device.hostname,keyword,0, +device.ip,ip,0, +device.mac,keyword,0, +device.serial_number,keyword,0, +device.timezone.offset.sec,long,0,-5400 +device.type,keyword,0,firewall +device.vendor,text,0, +device.version,keyword,0, +error.code,keyword,0, +error.id,keyword,0, +error.message,text,0, +event.category,keyword,0,metrics +event.created,date,0, +event.dataset,keyword,0,stats +event.duration,long,0, +event.hash,keyword,1,123456789012345678901234567890ABCD +event.id,keyword,1,8a4f500d +event.module,keyword,0,mysql +event.raw,keyword,1,Sep 19 08:26:10 host CEF:0|Security| threatmanager|1.0|100| worm successfully stopped|10|src=10.0.0.1 dst=2.1.2.2spt=1232 +event.risk_score,float,0, +event.severity,long,1,7 +event.type,keyword,0,nginx-stats-metrics +event.version,keyword,0,0.1.0 +file.ctime,date,0, +file.device,keyword,0, +file.extension,keyword,0,png +file.gid,keyword,0, +file.group,keyword,0, +file.inode,keyword,0, +file.mode,keyword,0,416 
+file.mtime,date,0, +file.owner,keyword,0, +file.path,text,0, +file.size,long,0, +file.target_path,text,0, +file.type,keyword,0, +file.uid,keyword,0, +geoip.city_name,keyword,0, +geoip.continent_name,keyword,0, +geoip.country_iso_code,keyword,0, +geoip.location,geo_point,0, +geoip.region_name,keyword,0, +host.architecture,keyword,0,x86_64 +host.id,keyword,1, +host.ip,ip,0, +host.mac,keyword,0, +host.name,keyword,1, +host.os.family,keyword,0,debian +host.os.name,keyword,0,Mac OS X +host.os.platform,keyword,0,darwin +host.os.version,keyword,0,10.12.6 +host.timezone.offset.sec,long,1,-5400 +host.type,keyword,1, +kubernetes.annotations,object,0, +kubernetes.container.name,keyword,0, +kubernetes.labels,object,0, +kubernetes.namespace,keyword,0, +kubernetes.pod.name,keyword,0, +log.level,keyword,0,ERR +log.line,long,0,18 +log.offset,long,0,12 +network.direction,keyword,0,inbound +network.forwarded_ip,ip,0,192.1.1.2 +network.inbound.bytes,long,0,184 +network.inbound.packets,long,0,12 +network.outbound.bytes,long,0,184 +network.outbound.packets,long,0,12 +network.protocol,keyword,0,http +organization.id,keyword,0, +organization.name,text,0, +process.args,keyword,0,"['-l', 'user', '10.0.0.16']" +process.name,keyword,0,ssh +process.pid,long,0, +process.ppid,long,0, +process.title,keyword,0, +service.ephemeral_id,keyword,0,8a4f500f +service.id,keyword,1,d37e5ebfe0ae6c4972dbe9f0174a1637bb8247f6 +service.name,keyword,1,elasticsearch +service.state,keyword,1, +service.type,keyword,1, +service.version,keyword,1,3.2.4 +source.domain,keyword,1, +source.hostname,keyword,0, +source.ip,ip,0, +source.mac,keyword,1, +source.port,long,1, +source.subdomain,keyword,1, +url.extension,keyword,0,png +url.hash,keyword,0, +url.hostname,keyword,0, +url.href,keyword,0,https://elastic.co:443/search?q=elasticsearch#top +url.password,keyword,0, +url.pathname,text,0, +url.port,keyword,0, +url.protocol,keyword,0, +url.search,text,0, +url.username,keyword,0, +user.email,keyword,1, +user.hash,keyword,1, 
+user.id,keyword,0, +user.name,keyword,0, +user_agent.device,keyword,0, +user_agent.major,long,0, +user_agent.minor,long,0, +user_agent.name,keyword,0,Chrome +user_agent.os.major,long,0, +user_agent.os.minor,long,0, +user_agent.os.name,keyword,0, +user_agent.os.name,keyword,0, +user_agent.os.version,keyword,0, +user_agent.patch,keyword,0, +user_agent.raw,text,0, +user_agent.version,keyword,0, diff --git a/schemas/agent.yml b/schemas/agent.yml new file mode 100644 index 0000000000..2e5480fcbf --- /dev/null +++ b/schemas/agent.yml @@ -0,0 +1,40 @@ +--- +- name: agent + title: Agent + group: 2 + description: > + The agent fields contain the data about the agent/client/shipper that + created the event. + + As an example in case of Beats for logs the `agent.name` is `filebeat`. + In the case of APM it is the agent running in the app / service. The agent + information does not change if data is sent through queuing systems like + Kafka, Redis, or processing systems like Logstash or APM Server. + fields: + - name: version + type: keyword + description: > + Agent version. + + example: 6.0.0-rc2 + - name: name + type: keyword + description: > + Agent name. + + Name of the agent. + example: filebeat + - name: id + type: keyword + description: > + Unique identifier of this agent if one exists. + + In the case of Beats this would be beat.id. + example: 8a4f500d + - name: ephemeral_id + type: keyword + description: > + Ephemeral identifier of this agent if one exists. + + This id compared to id normally changes across restarts. + example: 8a4f500f diff --git a/schemas/base.yml b/schemas/base.yml new file mode 100644 index 0000000000..62701b128b --- /dev/null +++ b/schemas/base.yml @@ -0,0 +1,50 @@ +--- +- name: base + title: Base + group: 1 + description: > + The base set contains all fields which are on the top level without + a namespace. + + These are fields which are common across all types of events. 
+ fields: + - name: "@timestamp" + type: date + required: true + phase: 3 + example: "2016-05-23T08:05:34.853Z" + description: > + Timestamp when the event was created. + + For log events this is expected to be when the event was generated and + not when it was read. + + Timestamp is a required field and must exist in all events. + + - name: tags + type: keyword + example: "[\"production\", \"env2\"]" + description: > + Tags is a list of keywords which are used to tag each event. + + - name: labels + type: object + example: "{key1: value1, key2: value2}" + description: > + Labels is an object which contains key/value pairs. + + Labels can be used to add additional meta information to events. Label + should not contain nested objects and all values are stored as keyword. + + An example usage is the docker and k8s labels. + + - name: message + type: text + phase: 1 + example: "Hello World" + description: > + For log events the message field contains the log message. + + In other use cases the message field can be used to concatenate together + different values which are then freely searchable. Or if multiple + messages exist they can be combined here into one message. diff --git a/schemas/cloud.yml b/schemas/cloud.yml new file mode 100644 index 0000000000..d306293b6b --- /dev/null +++ b/schemas/cloud.yml @@ -0,0 +1,49 @@ +--- +- name: cloud + title: Cloud + group: 2 + description: > + All fields related to the cloud or infrastructure the events + are coming from. + + In case Metricbeat is running on an EC2 host and fetches data from its + host, the cloud info is expected to contain the data about this machine. In + the case Metricbeat runs outside the cloud on a remote machine and fetches + data from a service running in the cloud it is expected to have the cloud + data from the machine on which the service is running in. + fields: + - name: provider + example: ec2 + type: keyword + description: > + Name of the cloud provider. 
Example values are ec2, gce, or + digitalocean. + + - name: availability_zone + example: us-east-1c + type: keyword + description: > + Availability zone in which this host is running. + + - name: region + type: keyword + example: us-east-1 + description: > + Region in which this host is running. + + - name: instance.id + type: keyword + example: i-1234567890abcdef0 + description: > + Instance ID of the host machine. + + - name: instance.name + type: keyword + description: > + Instance name of the host machine. + + - name: machine.type + type: keyword + example: t2.medium + description: > + Machine type of the host machine. diff --git a/schemas/container.yml b/schemas/container.yml new file mode 100644 index 0000000000..47573089c5 --- /dev/null +++ b/schemas/container.yml @@ -0,0 +1,35 @@ +--- +- name: container + title: Container + group: 2 + description: > + Container fields are used for meta information about the specific container + the information is coming from. This should help to correlate data based + containers from any runtime. + fields: + - name: runtime + type: keyword + description: > + Runtime managing this container. + example: docker + - name: id + type: keyword + description: > + Unique container id. + - name: image.name + type: keyword + description: > + Name of the image the container was built on. + - name: image.tag + type: keyword + description: > + Container image tag. + - name: name + type: keyword + description: > + Container name. + - name: labels + type: object + object_type: keyword + description: > + Image labels. diff --git a/schemas/destination.yml b/schemas/destination.yml new file mode 100644 index 0000000000..825bcb2fcb --- /dev/null +++ b/schemas/destination.yml @@ -0,0 +1,36 @@ +--- +- name: destination + title: Destination + group: 2 + description: > + Destination fields describe details about the destination of a + packet/event. + fields: + - name: ip + type: ip + description: > + IP address of the destination. 
+ + This can be one or multiple IPv4 or IPv6 addresses. + - name: hostname + type: keyword + description: > + Hostname of the destination. + - name: port + type: long + description: > + Port of the destination. + - name: mac + type: keyword + description: > + MAC address of the destination. + - name: domain + type: keyword + phase: 1 + description: > + Destination domain. + - name: subdomain + type: keyword + phase: 1 + description: > + Destination subdomain. diff --git a/schemas/device.yml b/schemas/device.yml new file mode 100644 index 0000000000..256f438590 --- /dev/null +++ b/schemas/device.yml @@ -0,0 +1,50 @@ +--- +- name: device + title: Device + group: 2 + description: > + Device fields are used to give additional information about the device + that the information is coming from. + + This could be a firewall, network device, etc. + fields: + - name: mac + type: keyword + description: > + MAC address of the device. + - name: ip + type: ip + description: > + IP address of the device. + - name: hostname + type: keyword + description: > + Hostname of the device. + - name: vendor + type: text + description: > + Device vendor information. + - name: version + type: keyword + description: > + Device version. + - name: serial_number + type: keyword + description: > + Device serial number. + - name: timezone.offset.sec + type: long + description: > + Timezone offset of the device in seconds. + + Number of seconds relative to UTC. In case the offset is -01:30 the + value will be -5400. + example: -5400 + - name: type + type: keyword + description: > + The type of the device the data is coming from. + + There is no predefined list of device types. Some examples are + `endpoint`, `firewall`, `ids`, `ips`, `proxy`. 
+ example: firewall diff --git a/schemas/error.yml b/schemas/error.yml new file mode 100644 index 0000000000..829e0fa9fb --- /dev/null +++ b/schemas/error.yml @@ -0,0 +1,25 @@ +--- +- name: error + title: Error + group: 2 + description: > + Error namespace + + This can be used to represent all kinds of errors. It can be for errors that + happen while fetching events or if the event itself contains an error. + + fields: + - name: id + type: keyword + description: > + Unique identifier for the error. + + - name: message + type: text + description: > + Error message. + + - name: code + type: keyword + description: > + Error code describing the error. diff --git a/schemas/event.yml b/schemas/event.yml new file mode 100644 index 0000000000..c81c6791a0 --- /dev/null +++ b/schemas/event.yml @@ -0,0 +1,119 @@ +--- +- name: event + title: Event + group: 2 + description: > + The event fields are used for context information about the data itself. + fields: + - name: id + type: keyword + description: > + Unique ID to describe the event. + example: 8a4f500d + phase: 1 + - name: category + type: keyword + description: > + Event category. + + This can be a user defined category. + example: metrics + + - name: type + type: keyword + description: > + A type given to this kind of event which can be used for grouping. + + This is normally defined by the user. + example: nginx-stats-metrics + + - name: module + type: keyword + description: > + Name of the module this data is coming from. + + This information is coming from the modules used in Beats or Logstash. + example: mysql + + - name: dataset + type: keyword + description: > + Name of the dataset. + + The concept of a `dataset` (fileset / metricset) is used in Beats as a + subset of modules. It contains the information which is currently + stored in metricset.name and metricset.module or fileset.name. 
+ example: stats + + - name: severity + type: long + phase: 1 + example: "7" + description: > + Severity describes the severity of the event. What the different + severity values mean can vary widely between use cases. It's up to + the implementer to make sure severities are consistent across events. + + - name: raw + type: keyword + phase: 1 + # Unfortunately this example is not shown correctly yet as | do not work + # in tables well + # Is | is the representation of | it works except for cases where + # used ticks. + example: "Sep 19 08:26:10 host CEF:0|Security| + threatmanager|1.0|100| + worm successfully stopped|10|src=10.0.0.1 + dst=2.1.2.2spt=1232" + description: > + Raw text message of entire event to be used to demonstrate log + integrity. + + - name: hash + type: keyword + phase: 1 + example: "123456789012345678901234567890ABCD" + description: > + Hash (perhaps logstash fingerprint) of raw field to be able to + demonstrate log integrity. + + - name: version + type: keyword + required: true + description: > + The version field contains the version an event for ECS adheres to. + + This field should be provided as part of each event to make it possible + to detect to which ECS version an event belongs. + + event.version is a required field and must exist in all events. It + describes which ECS version the event adheres to. + + The current version is 0.1.0. + example: 0.1.0 + + - name: duration + type: long + description: > + Duration of the event in nanoseconds. + + - name: created + type: date + description: > + event.created contains the date when the event was created. + + This timestamp is distinct from @timestamp in that @timestamp contains + the processed timestamp. For logs these two timestamps can be different + as the timestamp in the log line and when the event is read for example + by Filebeat are not identical. `@timestamp` must contain the timestamp + extracted from the log line, event.created when the log line is read. 
+ The same could apply to packet capturing where @timestamp contains the + timestamp extracted from the network packet and event.created when the + event was created. + + In case the two timestamps are identical, @timestamp should be used. + + - name: risk_score + type: float + description: > + Risk score value of the event. diff --git a/schemas/file.yml b/schemas/file.yml new file mode 100644 index 0000000000..3c88239d91 --- /dev/null +++ b/schemas/file.yml @@ -0,0 +1,83 @@ +--- +- name: file + group: 2 + title: File + description: > + File attributes. + fields: + - name: path + type: text + description: The path to the file. + multi_fields: + - name: raw + type: keyword + description: > + The path to the file. This is a non-analyzed field that is useful + for aggregations. + + - name: target_path + type: text + description: The target path for symlinks. + multi_fields: + - name: raw + type: keyword + description: > + The path to the file. This is a non-analyzed field that is useful + for aggregations. + + - name: extension + type: keyword + description: > + The file extension. + + This should allow easy filtering by file extensions. + example: png + + - name: type + type: keyword + description: The file type (file, dir, or symlink). + + - name: device + type: keyword + description: The device. + + - name: inode + type: keyword + description: The inode representing the file in the filesystem. + + - name: uid + type: keyword + description: > + The user ID (UID) or security identifier (SID) of the file owner. + + - name: owner + type: keyword + description: The file owner's username. + + - name: gid + type: keyword + description: The primary group ID (GID) of the file. + + - name: group + type: keyword + description: The primary group name of the file. + + - name: mode + type: keyword + example: 0640 + description: The mode of the file in octal representation. 
+ + - name: size + type: long + description: The file size in bytes (field is only added when `type` is + `file`). + + - name: mtime + type: date + description: The last modified time of the file (time when content was + modified). + + - name: ctime + type: date + description: The last change time of the file (time when metadata was + changed). diff --git a/schemas/geoip.yml b/schemas/geoip.yml new file mode 100644 index 0000000000..aad389ee28 --- /dev/null +++ b/schemas/geoip.yml @@ -0,0 +1,30 @@ +--- +- name: geoip + title: Geoip + group: 2 + description: > + Geoip fields are used for geo information about an IP address. + + The conversion to geoip information can be done by the Elasticsearch geoip + plugin. + fields: + - name: continent_name + type: keyword + description: > + The name of the continent. + - name: country_iso_code + type: keyword + description: > + Country ISO code. + - name: location + type: geo_point + description: > + The longitude and latitude. + - name: region_name + type: keyword + description: > + The region name. + - name: city_name + type: keyword + description: > + The city name. diff --git a/schemas/host.yml b/schemas/host.yml new file mode 100644 index 0000000000..afb098ee71 --- /dev/null +++ b/schemas/host.yml @@ -0,0 +1,88 @@ +--- +- name: host + title: Host + group: 2 + description: > + All fields related to a host. A host can be a physical machine, a virtual + machine, and also a Docker container. + + Normally the host information is related to the machine on which the event + was generated / collected but also can be used differently if needed. + fields: + - name: timezone.offset.sec + type: long + description: > + Timezone offset of the host in seconds. + + Number of seconds relative to UTC. In case the offset is -01:30 the + value will be -5400. + phase: 1 + example: -5400 + + - name: name + type: keyword + description: > + host.name is the hostname of the host. 
+ + It can contain what `hostname` returns on Unix systems, the fully + qualified domain name, or a name specified by the user. It is + up to the sender to decide which value to use. + phase: 1 + + - name: id + type: keyword + phase: 1 + description: > + Unique host id. + + As hostname is not always unique, this often can be configured by the + user. An example here is the current usage of `beat.name`. + - name: ip + type: ip + description: > + Host IP address. + + - name: mac + type: keyword + description: > + Host MAC address. + + - name: type + type: keyword + description: > + This is the type of the host. + + For cloud providers this can be the machine type like `t2.medium`. + It can also be `vm` or `container`, for example, or something user defined. + phase: 1 + + # Operating System information + - name: os.platform + type: keyword + description: > + Operating system platform (e.g. centos, ubuntu, windows). + example: darwin + + - name: os.name + type: keyword + example: "Mac OS X" + description: > + Operating system name. + + - name: os.family + type: keyword + example: "debian" + description: > + OS family (e.g. redhat, debian, freebsd, windows). + + - name: os.version + type: keyword + example: "10.12.6" + description: > + Operating system version. + + - name: architecture + type: keyword + example: "x86_64" + description: > + Operating system architecture. diff --git a/schemas/kubernetes.yml b/schemas/kubernetes.yml new file mode 100644 index 0000000000..9d4bfd0b51 --- /dev/null +++ b/schemas/kubernetes.yml @@ -0,0 +1,34 @@ +--- +- name: kubernetes + title: Kubernetes + group: 2 + description: > + Kubernetes fields are used for meta information about k8s. This should help + to correlate data coming out of k8s setups.
+ + fields: + - name: pod.name + type: keyword + description: > + Kubernetes pod name + + - name: namespace + type: keyword + description: > + Kubernetes namespace + + - name: labels + type: object + description: > + Kubernetes labels map + + - name: annotations + type: object + description: > + Kubernetes annotations map + + - name: container.name + type: keyword + description: > + Kubernetes container name. This name is unique within the pod only, + it's different from underlying container name (container.name in ECS) diff --git a/schemas/log.yml b/schemas/log.yml new file mode 100644 index 0000000000..2c6047c586 --- /dev/null +++ b/schemas/log.yml @@ -0,0 +1,23 @@ +--- +- name: log + title: Log + description: > + Fields which are specific to log events. + fields: + - name: level + type: keyword + description: > + Log level of the log event. + + Some examples are `WARN`, `ERR`, `INFO`. + example: ERR + - name: line + type: long + description: > + Line number the log event was collected from. + example: 18 + - name: offset + type: long + description: > + Offset of the beginning of the log event. + example: 12 diff --git a/schemas/network.yml b/schemas/network.yml new file mode 100644 index 0000000000..afc422d2a0 --- /dev/null +++ b/schemas/network.yml @@ -0,0 +1,51 @@ +--- +- name: network + title: Network + group: 2 + description: > + All fields related to network data. + fields: + - name: protocol + type: keyword + description: > + Network protocol name. + example: http + - name: direction + type: keyword + description: > + Direction of the network traffic. + + The recommended values are: + * inbound + * outbound + * unknown + example: inbound + + - name: forwarded_ip + type: ip + description: > + forwarded_ip indicates the host IP address when the source IP address + is the proxy. + example: 192.1.1.2 + + # Metrics + - name: inbound.bytes + type: long + description: > + Network inbound bytes. 
+ example: 184 + - name: inbound.packets + type: long + description: > + Network inbound packets. + example: 12 + - name: outbound.bytes + type: long + description: > + Network outbound bytes. + example: 184 + - name: outbound.packets + type: long + description: > + Network outbound packets. + example: 12 diff --git a/schemas/organization.yml b/schemas/organization.yml new file mode 100644 index 0000000000..797773fffa --- /dev/null +++ b/schemas/organization.yml @@ -0,0 +1,20 @@ +--- +- name: organization + title: Organization + group: 2 + description: > + The organization namespace can be used to enrich data with information about + which organization the data belongs to. + + This can be useful if data stored in the same index should + sometimes be filtered or organized by one or multiple organizations. + fields: + - name: name + type: text + description: > + Organization name. + + - name: id + type: keyword + description: > + Unique identifier for the organization. diff --git a/schemas/process.yml b/schemas/process.yml new file mode 100644 index 0000000000..56b6890814 --- /dev/null +++ b/schemas/process.yml @@ -0,0 +1,42 @@ +--- +- name: process + title: Process + group: 2 + description: > + These fields contain information about a process. + + If metrics information is collected for a process and a process id / name + shows up in a log message, these fields should help to correlate the two. + It is expected that the `process.pid` will often also stay in the metric + itself and is only copied to the global field for correlation. + fields: + - name: args + type: keyword + description: > + Process arguments. + + May be filtered to protect sensitive information. + example: ["-l", "user", "10.0.0.16"] + - name: name + type: keyword + description: > + Process name. + + This is sometimes also known as program name or similar. + example: ssh + - name: pid + type: long + description: > + Process id.
+ example: 4242 + - name: ppid + type: long + description: > + Process parent id. + + - name: title + type: keyword + description: > + Process title. + + The proctitle, often the same as process name. diff --git a/schemas/service.yml b/schemas/service.yml new file mode 100644 index 0000000000..4111bd057a --- /dev/null +++ b/schemas/service.yml @@ -0,0 +1,63 @@ +--- +- name: service + title: Service + group: 2 + description: > + The service fields describe the service for / from which the data was + collected. + + If logs or metrics are collected from Redis, `service.name` would be + `redis`. This makes it possible to find and correlate logs for a specific service and + even version with `service.version`. + + fields: + - name: id + type: keyword + description: > + Unique identifier of the running service. + + This id should uniquely identify this service. This makes it possible + to correlate logs and metrics for one specific service. For example + in case of issues with one redis instance, it's possible to filter on + the id to see metrics and logs for this single instance. + + example: d37e5ebfe0ae6c4972dbe9f0174a1637bb8247f6 + phase: 1 + + - name: name + type: keyword + phase: 1 + example: "elasticsearch" + description: > + Name of the service data is collected from. + + The name can be used to group logs and metrics together from one + service and correlate them. + + - name: type + type: keyword + phase: 1 + description: > + Service type. + + - name: state + type: keyword + phase: 1 + description: > + Current state of the service. + - name: version + type: keyword + phase: 1 + example: "3.2.4" + description: > + Version of the service the data was collected from. + + This makes it possible to look at a data set for only a specific version of a + service. + - name: ephemeral_id + type: keyword + description: > + Ephemeral identifier of this service if one exists. + + Unlike `id`, this id normally changes across restarts.
+ example: 8a4f500f diff --git a/schemas/source.yml b/schemas/source.yml new file mode 100644 index 0000000000..e93f55fd59 --- /dev/null +++ b/schemas/source.yml @@ -0,0 +1,38 @@ +--- +- name: source + title: Source + group: 2 + description: > + Source fields describe details about the source the event is + coming from. + fields: + - name: ip + type: ip + description: > + IP address of the source. + + This can be one or multiple IPv4 or IPv6 addresses. + - name: hostname + type: keyword + description: > + Hostname of the source. + - name: port + type: long + phase: 1 + description: > + Port of the source. + - name: mac + type: keyword + phase: 1 + description: > + MAC address of the source. + - name: domain + type: keyword + phase: 1 + description: > + Source domain. + - name: subdomain + type: keyword + phase: 1 + description: > + Source subdomain. diff --git a/schemas/url.yml b/schemas/url.yml new file mode 100644 index 0000000000..6bcbd49b15 --- /dev/null +++ b/schemas/url.yml @@ -0,0 +1,89 @@ +--- +- name: url + title: URL + description: > + A complete URL, with scheme, host, and path. + + The URL object can be reused in other prefixes like `host.url.*` for + example. It is important that whenever URL is used, the same structure + is used. + + `url.href` is a [multi field](https://www.elastic.co/guide/en/elasticsearch/reference/6.2/multi-fields.html#_multi_fields_with_multiple_analyzers) + which means the data is stored as keyword `url.href` and text + `url.href.analyzed`. The advantage of this is that running a query + against only a part of the URL still works without having to split up the + URL into all its parts at ingest time. + + Based on whatwg URL definition: https://github.com/whatwg/url/issues/337 + fields: + - name: href + type: keyword + description: > + href contains the full url. The field is stored as keyword. + + `href` is an analyzed field so the parsed information can be accessed + through `href.analyzed` in queries.
+ multi_fields: + - name: analyzed + type: text + example: https://elastic.co:443/search?q=elasticsearch#top + - name: protocol + type: keyword + description: > + The protocol of the request, e.g. "https:". + - name: hostname + type: keyword + description: > + The hostname of the request, e.g. "example.com". + + For correlation, this field can be copied into the `host.name` + field. + - name: port + type: keyword + description: > + The port of the request, e.g. 443. + - name: pathname + type: text + description: > + The path of the request, e.g. "/search". + multi_fields: + - name: raw + type: keyword + description: > + The url path. This is a non-analyzed field that is useful + for aggregations. + - name: search + type: text + description: > + The search describes the query string of the request, + e.g. "q=elasticsearch". + multi_fields: + - name: raw + type: keyword + description: > + The url search part. This is a non-analyzed field that is useful + for aggregations. + - name: hash + type: keyword + description: > + The hash of the request URL, e.g. "top". + - name: username + type: keyword + description: > + The username of the request. + - name: password + type: keyword + description: > + The password of the request. + - name: extension + type: keyword + description: > + The url extension field contains the extension of the file associated with + the url. + + A simple example is `http://localhost/logo.png` where the extension would be `png`. + There can also be more complex cases like `http://localhost/content?asset=logo.png&token=XYZ` + where the extension could also be `png` but depends on the implementation. + + The `extension` field should be left out if the extension is not defined.
+ example: png diff --git a/schemas/user.yml b/schemas/user.yml new file mode 100644 index 0000000000..50fa78de5f --- /dev/null +++ b/schemas/user.yml @@ -0,0 +1,33 @@ +--- +- name: user + title: User + group: 2 + description: > + The user fields are used to describe user information as part of the event. + + All fields in user can have one or multiple entries. If a user has more + than one id, an array with the ids must be provided. + fields: + - name: id + type: keyword + description: > + One or multiple unique identifiers of the user. + - name: name + type: keyword + description: > + Name of the user. + + As the field is a keyword, the field will not be tokenized. + - name: email + type: keyword + phase: 1 + description: > + User email address. + - name: hash + type: keyword + phase: 1 + description: > + Unique user hash to correlate information for a user in anonymized form. + + This is useful in case `user.id` or `user.name` cannot be used because + they contain confidential information. diff --git a/schemas/user_agent.yml b/schemas/user_agent.yml new file mode 100644 index 0000000000..fea3c88c21 --- /dev/null +++ b/schemas/user_agent.yml @@ -0,0 +1,59 @@ +--- +- name: user_agent + title: User agent + group: 2 + description: > + The user_agent fields normally come from a browser request. + + They commonly show up in web service logs coming from the parsed user + agent string. + fields: + - name: raw + type: text + description: > + Unparsed version of the user_agent. + - name: device + type: keyword + description: > + The name of the physical device. + - name: version + type: keyword + description: > + Version of the physical device. + - name: major + type: long + description: > + The major version of the user agent. + - name: minor + type: long + description: > + The minor version of the user agent. + - name: patch + type: keyword + description: > + The patch version of the user agent.
+ - name: name + type: keyword + example: Chrome + description: > + The name of the user agent. + - name: os.name + type: keyword + description: > + The name of the operating system. + - name: os.version + type: keyword + description: > + Version of the operating system. + - name: os.major + type: long + description: > + The major version of the operating system. + - name: os.minor + type: long + description: > + The minor version of the operating system. + - name: os.name + type: keyword + description: > + The name of the operating system. diff --git a/scripts/helper.py b/scripts/helper.py new file mode 100644 index 0000000000..909178b5ae --- /dev/null +++ b/scripts/helper.py @@ -0,0 +1,139 @@ +import yaml + + +def read_schema_file(path): + """Read a schema.yml file and cleans up the fields + """ + fields = [] + with open(path) as f: + fields = yaml.load(f.read()) + + clean_fields(fields) + return fields + + +def read_use_case_file(path): + """Read a use-case.yml file and cleans up the fields + """ + with open(path) as f: + use_case = yaml.load(f.read()) + + fields = use_case["fields"] + clean_fields(fields) + use_case["fields"] = fields + return use_case + + +def clean_fields(fields): + """Cleans up all fields to set defaults + """ + for namespace in fields: + + # For now set the default group to 2 + if "group" not in namespace: + namespace["group"] = 2 + + for field in namespace["fields"]: + clean_string_field(field, "description") + clean_string_field(field, "example") + clean_string_field(field, "type") + + # Prefix if not base namespace + if namespace["name"] != "base": + field["name"] = namespace["name"] + "." 
+ field["name"] + + if 'phase' not in field.keys(): + field["phase"] = 0 + + if 'group' not in field.keys(): + # If no group set, set parent group + field["group"] = namespace["group"] + + if "multi_fields" in field: + for f in field["multi_fields"]: + clean_string_field(f, "description") + clean_string_field(f, "example") + clean_string_field(f, "type") + + # Prefix if not base namespace + if namespace["name"] != "base": + f["name"] = field["name"] + "." + f["name"] + + if 'phase' not in f.keys(): + f["phase"] = 0 + + if 'group' not in f.keys(): + # If no group set, set parent group + f["group"] = namespace["group"] + + +def clean_string_field(field, key): + """Cleans a string field and creates an empty string for the field in case it does not exist + """ + if key in field.keys(): + # Remove all spaces and newlines from beginning and end + field[key] = str(field[key]).strip() + else: + field[key] = "" + + +def get_markdown_row(field, link, multi_field): + """Creates a markdown table for the given fields + """ + + # Replace newlines with HTML representation as otherwise newlines don't work in Markdown + description = field["description"].replace("\n", "
") + + # Verified and accepted fields are bold + verified = False + if 'verified' in field.keys() and field["verified"]: + field["name"] = "**" + field["name"] + "**" + + example = "" + if field["example"] != "": + # Add ticks around examples to not break table + example = "`{}`".format(field["example"]) + + if multi_field: + multi_field = "1" + else: + multi_field = "" + + # If link is true, it link to the anchor is provided. This is used for the use-cases + if link: + return '| [`{}`]({}#{}) | {} | {} | {} | {} |\n'.format(field["name"], link, field["name"], description, field["type"], multi_field, example) + + # By default a anchor is attached to the name + return '| `{}` | {} | {} | {} | {} |\n'.format(field["name"], field["name"], description, field["type"], multi_field, example) + + +def get_markdown_table(namespace, title_prefix="##", link=False): + + output = '{} {} fields\n\n'.format(title_prefix, namespace["name"], namespace["title"]) + + # Replaces one newlines with two as otherwise double newlines do not show up in markdown + output += namespace["description"].replace("\n", "\n\n") + "\n" + + titles = ["Field", "Description", "Type", "Multi Field", "Example"] + + for title in titles: + output += "| {} ".format(title) + output += "|\n" + + for title in titles: + output += "|---" + output += "|\n" + + # Sort fields for easier readability + namespaceFields = sorted(namespace["fields"], key=lambda field: field["name"]) + + # Print fields into a table + for field in namespace["fields"]: + output += get_markdown_row(field, link, False) + if "multi_fields" in field: + for f in field["multi_fields"]: + output += get_markdown_row(f, link, True) + + output += "\n\n" + + return output diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 0000000000..5500f007d0 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1 @@ +PyYAML diff --git a/scripts/schemas.py b/scripts/schemas.py new file mode 100644 index 0000000000..77b9507d68 
--- /dev/null +++ b/scripts/schemas.py @@ -0,0 +1,104 @@ +import csv +import os +import yaml +import sys +import copy +from helper import * +import argparse + + +def create_csv(fields, file): + + open_mode = "wb" + if sys.version_info >= (3, 0): + open_mode = "w" + + # Create markdown schema output file + with open(file, open_mode) as csvfile: + schema_writer = csv.writer(csvfile, + delimiter=',', + quoting=csv.QUOTE_MINIMAL, + lineterminator='\n') + schema_writer.writerow(["Field", "Type", "Phase", "Example"]) + + for namespace in fields: + if len(namespace["fields"]) == 0: + continue + + # Sort fields for easier readability + namespaceFields = sorted(namespace["fields"], + key=lambda field: field["name"]) + + # Print fields into a table + for field in namespaceFields: + schema_writer.writerow([field["name"], field["type"], field["phase"], field["example"]]) + + +def create_markdown(fields, file): + # Create markdown schema output file + output = open(file, 'w') + + for namespace in fields: + if len(namespace["fields"]) == 0: + continue + output.write(get_markdown_table(namespace)) + + output.close() + + +def create_markdown_string(fields): + # Create markdown schema output string + output = "" + + links = "" + for namespace in fields: + if len(namespace["fields"]) == 0: + continue + # Link list to field prefixes + links += " * [{} fields](#{})\n".format(namespace["title"], namespace["name"]) + output += get_markdown_table(namespace) + + output = links + "\n" + output + "\n\n" + return output + + +def filtered_fields(fields, groups): + new_fields = copy.deepcopy(fields) + for f in new_fields: + n = 0 + for field in list(f["fields"]): + if field["group"] not in groups: + del f["fields"][n] + continue + n = n + 1 + + return new_fields + + +if __name__ == "__main__": + + # Load schema files into yaml + files = os.listdir("./schemas") + + fields = [] + for file in sorted(os.listdir("schemas")): + fields = fields + read_schema_file("schemas/" + file) + + # Load all 
fields into object + sortedNamespaces = sorted(fields, key=lambda field: field["group"]) + + parser = argparse.ArgumentParser() + parser.add_argument('--stdout', help='output to stdout instead of files') + args = parser.parse_args() + + if args.stdout == "true": + groups = [1, 2, 3] + f_fields = filtered_fields(sortedNamespaces, groups) + # Print to stdout + print create_markdown_string(f_fields) + + else: + groups = [1, 2, 3] + f_fields = filtered_fields(sortedNamespaces, groups) + #create_markdown(f_fields, "schema.md") + create_csv(f_fields, "schema.csv") diff --git a/scripts/template.go b/scripts/template.go new file mode 100644 index 0000000000..3388aee722 --- /dev/null +++ b/scripts/template.go @@ -0,0 +1,64 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/elastic/beats/libbeat/common" + "github.com/elastic/beats/libbeat/template" + "github.com/elastic/go-ucfg/yaml" +) + +func main() { + + // For the path tob e correct, the execution must be from the top directory + paths, err := filepath.Glob("./schemas/*") + if err != nil { + fmt.Printf("Error: %s \n", err) + os.Exit(1) + } + + fields := common.Fields{} + + for _, path := range paths { + f := common.Fields{} + + cfg, err := yaml.NewConfigWithFile(path) + if err != nil { + fmt.Printf("Error: %s \n", err) + os.Exit(1) + } + cfg.Unpack(&f) + + for key, f2 := range f { + // The definitions don't have the type group in and the template + // generator assumes otherwise keyword as default + f[key].Type = "group" + + // Moves the docs under base to the top level + if f2.Name == "base" { + f = f2.Fields + } + } + + fields = append(fields, f...) 
+ } + + t, err := template.New("1.0.0", "ecs", "6.0.0", template.TemplateConfig{}) + if err != nil { + fmt.Printf("Error: %s \n", err) + os.Exit(1) + } + + // Start processing at the root + properties := common.MapStr{} + processor := template.Processor{} + if err := processor.Process(fields, "", properties); err != nil { + fmt.Printf("Error: %s \n", err) + os.Exit(1) + } + output := t.Generate(properties, nil) + + fmt.Printf("%s", output.StringToPrint()) +} diff --git a/scripts/use-cases.py b/scripts/use-cases.py new file mode 100644 index 0000000000..6b5b8fae74 --- /dev/null +++ b/scripts/use-cases.py @@ -0,0 +1,63 @@ +import yaml +import os +import argparse +from helper import * +import os.path + + +def write_stdout(): + + link_prefix = "https://github.com/elastic/ecs" + + links = "" + for file in os.listdir("./use-cases"): + + output = "" + + if not file.endswith(".yml"): + continue + + use_case = read_use_case_file("./use-cases/" + file) + + schema_link = "https://github.com/elastic/ecs/blob/master/use-cases/" + # Link list to field prefixes + links += " * [{}]({}{}.md)\n".format(use_case["title"], schema_link, use_case["name"]) + + output += "## {} use case\n\n".format(use_case["title"]) + output += "{}\n\n".format(use_case["description"]) + + fields = [] + for f in use_case["fields"]: + # In case a description exists for a prefix, add is as field with .* + if "description" in f and f["description"] != "": + fields.append({ + "name": f["name"] + ".*", + "description": f["description"], + "type": "", + "phase": "", + "example": "", + }) + + for f2 in f["fields"]: + fields.append(f2) + + global_fields = {"name": use_case["name"], "title": use_case["title"], "description": "", "fields": fields} + output += get_markdown_table(global_fields, "###", link_prefix) + "\n" + + # Write output to /use-cases/use_case["name"].md file + # Adjust links + + with open("./use-cases/" + use_case["name"] + ".md", "w") as f: + f.write(output) + + print "\n" + links + "\n\n" + + 
+if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument('--stdout', help='output to stdout instead of files') + args = parser.parse_args() + + if args.stdout == "true": + write_stdout() diff --git a/template.json b/template.json new file mode 100644 index 0000000000..2934113513 --- /dev/null +++ b/template.json @@ -0,0 +1,717 @@ +{ + "index_patterns": [ + "ecs-1.0.0-*" + ], + "mappings": { + "doc": { + "_meta": { + "version": "1.0.0" + }, + "date_detection": false, + "dynamic_templates": [ + { + "strings_as_keyword": { + "mapping": { + "ignore_above": 1024, + "type": "keyword" + }, + "match_mapping_type": "string" + } + } + ], + "properties": { + "@timestamp": { + "type": "date" + }, + "agent": { + "properties": { + "ephemeral_id": { + "ignore_above": 1024, + "type": "keyword" + }, + "id": { + "ignore_above": 1024, + "type": "keyword" + }, + "name": { + "ignore_above": 1024, + "type": "keyword" + }, + "version": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "cloud": { + "properties": { + "availability_zone": { + "ignore_above": 1024, + "type": "keyword" + }, + "instance": { + "properties": { + "id": { + "ignore_above": 1024, + "type": "keyword" + }, + "name": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "machine": { + "properties": { + "type": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "provider": { + "ignore_above": 1024, + "type": "keyword" + }, + "region": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "container": { + "properties": { + "id": { + "ignore_above": 1024, + "type": "keyword" + }, + "image": { + "properties": { + "name": { + "ignore_above": 1024, + "type": "keyword" + }, + "tag": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "labels": { + "type": "object" + }, + "name": { + "ignore_above": 1024, + "type": "keyword" + }, + "runtime": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "destination": { + "properties": { + "domain": { 
+ "ignore_above": 1024, + "type": "keyword" + }, + "hostname": { + "ignore_above": 1024, + "type": "keyword" + }, + "ip": { + "type": "ip" + }, + "mac": { + "ignore_above": 1024, + "type": "keyword" + }, + "port": { + "type": "long" + }, + "subdomain": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "device": { + "properties": { + "hostname": { + "ignore_above": 1024, + "type": "keyword" + }, + "ip": { + "type": "ip" + }, + "mac": { + "ignore_above": 1024, + "type": "keyword" + }, + "serial_number": { + "ignore_above": 1024, + "type": "keyword" + }, + "timezone": { + "properties": { + "offset": { + "properties": { + "sec": { + "type": "long" + } + } + } + } + }, + "type": { + "ignore_above": 1024, + "type": "keyword" + }, + "vendor": { + "norms": false, + "type": "text" + }, + "version": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "error": { + "properties": { + "code": { + "ignore_above": 1024, + "type": "keyword" + }, + "id": { + "ignore_above": 1024, + "type": "keyword" + }, + "message": { + "norms": false, + "type": "text" + } + } + }, + "event": { + "properties": { + "category": { + "ignore_above": 1024, + "type": "keyword" + }, + "created": { + "type": "date" + }, + "dataset": { + "ignore_above": 1024, + "type": "keyword" + }, + "duration": { + "type": "long" + }, + "hash": { + "ignore_above": 1024, + "type": "keyword" + }, + "id": { + "ignore_above": 1024, + "type": "keyword" + }, + "module": { + "ignore_above": 1024, + "type": "keyword" + }, + "raw": { + "ignore_above": 1024, + "type": "keyword" + }, + "risk_score": { + "type": "float" + }, + "severity": { + "type": "long" + }, + "type": { + "ignore_above": 1024, + "type": "keyword" + }, + "version": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "file": { + "properties": { + "ctime": { + "type": "date" + }, + "device": { + "ignore_above": 1024, + "type": "keyword" + }, + "extension": { + "ignore_above": 1024, + "type": "keyword" + }, + "gid": { + "ignore_above": 
1024, + "type": "keyword" + }, + "group": { + "ignore_above": 1024, + "type": "keyword" + }, + "inode": { + "ignore_above": 1024, + "type": "keyword" + }, + "mode": { + "ignore_above": 1024, + "type": "keyword" + }, + "mtime": { + "type": "date" + }, + "owner": { + "ignore_above": 1024, + "type": "keyword" + }, + "path": { + "fields": { + "raw": { + "ignore_above": 1024, + "type": "keyword" + } + }, + "norms": false, + "type": "text" + }, + "size": { + "type": "long" + }, + "target_path": { + "fields": { + "raw": { + "ignore_above": 1024, + "type": "keyword" + } + }, + "norms": false, + "type": "text" + }, + "type": { + "ignore_above": 1024, + "type": "keyword" + }, + "uid": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "geoip": { + "properties": { + "city_name": { + "ignore_above": 1024, + "type": "keyword" + }, + "continent_name": { + "ignore_above": 1024, + "type": "keyword" + }, + "country_iso_code": { + "ignore_above": 1024, + "type": "keyword" + }, + "location": { + "type": "geo_point" + }, + "region_name": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "host": { + "properties": { + "architecture": { + "ignore_above": 1024, + "type": "keyword" + }, + "id": { + "ignore_above": 1024, + "type": "keyword" + }, + "ip": { + "type": "ip" + }, + "mac": { + "ignore_above": 1024, + "type": "keyword" + }, + "name": { + "ignore_above": 1024, + "type": "keyword" + }, + "os": { + "properties": { + "family": { + "ignore_above": 1024, + "type": "keyword" + }, + "name": { + "ignore_above": 1024, + "type": "keyword" + }, + "platform": { + "ignore_above": 1024, + "type": "keyword" + }, + "version": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "timezone": { + "properties": { + "offset": { + "properties": { + "sec": { + "type": "long" + } + } + } + } + }, + "type": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "kubernetes": { + "properties": { + "annotations": { + "type": "object" + }, + "container": { + "properties": { 
+ "name": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "labels": { + "type": "object" + }, + "namespace": { + "ignore_above": 1024, + "type": "keyword" + }, + "pod": { + "properties": { + "name": { + "ignore_above": 1024, + "type": "keyword" + } + } + } + } + }, + "labels": { + "type": "object" + }, + "log": { + "properties": { + "level": { + "ignore_above": 1024, + "type": "keyword" + }, + "line": { + "type": "long" + }, + "offset": { + "type": "long" + } + } + }, + "message": { + "norms": false, + "type": "text" + }, + "network": { + "properties": { + "direction": { + "ignore_above": 1024, + "type": "keyword" + }, + "forwarded_ip": { + "type": "ip" + }, + "inbound": { + "properties": { + "bytes": { + "type": "long" + }, + "packets": { + "type": "long" + } + } + }, + "outbound": { + "properties": { + "bytes": { + "type": "long" + }, + "packets": { + "type": "long" + } + } + }, + "protocol": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "organization": { + "properties": { + "id": { + "ignore_above": 1024, + "type": "keyword" + }, + "name": { + "norms": false, + "type": "text" + } + } + }, + "process": { + "properties": { + "args": { + "ignore_above": 1024, + "type": "keyword" + }, + "name": { + "ignore_above": 1024, + "type": "keyword" + }, + "pid": { + "type": "long" + }, + "ppid": { + "type": "long" + }, + "title": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "service": { + "properties": { + "ephemeral_id": { + "ignore_above": 1024, + "type": "keyword" + }, + "id": { + "ignore_above": 1024, + "type": "keyword" + }, + "name": { + "ignore_above": 1024, + "type": "keyword" + }, + "state": { + "ignore_above": 1024, + "type": "keyword" + }, + "type": { + "ignore_above": 1024, + "type": "keyword" + }, + "version": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "source": { + "properties": { + "domain": { + "ignore_above": 1024, + "type": "keyword" + }, + "hostname": { + "ignore_above": 1024, + "type": 
"keyword" + }, + "ip": { + "type": "ip" + }, + "mac": { + "ignore_above": 1024, + "type": "keyword" + }, + "port": { + "type": "long" + }, + "subdomain": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "tags": { + "ignore_above": 1024, + "type": "keyword" + }, + "url": { + "properties": { + "extension": { + "ignore_above": 1024, + "type": "keyword" + }, + "hash": { + "ignore_above": 1024, + "type": "keyword" + }, + "hostname": { + "ignore_above": 1024, + "type": "keyword" + }, + "href": { + "fields": { + "analyzed": { + "norms": false, + "type": "text" + } + }, + "ignore_above": 1024, + "type": "keyword" + }, + "password": { + "ignore_above": 1024, + "type": "keyword" + }, + "pathname": { + "fields": { + "raw": { + "ignore_above": 1024, + "type": "keyword" + } + }, + "norms": false, + "type": "text" + }, + "port": { + "ignore_above": 1024, + "type": "keyword" + }, + "protocol": { + "ignore_above": 1024, + "type": "keyword" + }, + "search": { + "fields": { + "raw": { + "ignore_above": 1024, + "type": "keyword" + } + }, + "norms": false, + "type": "text" + }, + "username": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "user": { + "properties": { + "email": { + "ignore_above": 1024, + "type": "keyword" + }, + "hash": { + "ignore_above": 1024, + "type": "keyword" + }, + "id": { + "ignore_above": 1024, + "type": "keyword" + }, + "name": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "user_agent": { + "properties": { + "device": { + "ignore_above": 1024, + "type": "keyword" + }, + "major": { + "type": "long" + }, + "minor": { + "type": "long" + }, + "name": { + "ignore_above": 1024, + "type": "keyword" + }, + "os": { + "properties": { + "major": { + "type": "long" + }, + "minor": { + "type": "long" + }, + "name": { + "ignore_above": 1024, + "type": "keyword" + }, + "version": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "patch": { + "ignore_above": 1024, + "type": "keyword" + }, + "raw": { + "norms": false, + 
"type": "text" + }, + "version": { + "ignore_above": 1024, + "type": "keyword" + } + } + } + } + } + }, + "order": 1, + "settings": { + "index": { + "mapping": { + "total_fields": { + "limit": 10000 + } + }, + "refresh_interval": "5s" + } + } +} \ No newline at end of file diff --git a/use-cases/README.md b/use-cases/README.md new file mode 100644 index 0000000000..24ea710ecb --- /dev/null +++ b/use-cases/README.md @@ -0,0 +1,11 @@ +# Use cases + +The use cases directory is used to define the fields for some more specific use cases. All the fields used here are inherited from ECS but are referenced here to have more details on it. + +## Generate + +To generate the markdown from the use-cases run + +``` +python use-cases.py +``` diff --git a/use-cases/apm.md b/use-cases/apm.md new file mode 100644 index 0000000000..2e45063068 --- /dev/null +++ b/use-cases/apm.md @@ -0,0 +1,21 @@ +## APM use case + +ECS usage for the APM data. + +### APM fields + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| [`id`](https://github.com/elastic/ecs#id) | Unique id to describe the event. | keyword | | `8a4f500d` | +| [`timestamp`](https://github.com/elastic/ecs#timestamp) | Timestamp when the event was created in the app / service. | date | | `2016-05-23T08:05:34.853Z` | +| [`agent.*`](https://github.com/elastic/ecs#agent.*) | The agent fields are used to describe which agent did send the information.
| | | | +| [`agent.version`](https://github.com/elastic/ecs#agent.version) | APM Agent version. | keyword | | `3.14.0` | +| [`agent.name`](https://github.com/elastic/ecs#agent.name) | APM agent name. | keyword | | `elastic-node` | +| [`service.*`](https://github.com/elastic/ecs#service.*) | The service fields describe the service inside which the APM agent is running.
| | | | +| [`service.id`](https://github.com/elastic/ecs#service.id) | Unique identifier of the running service. | keyword | | `d37e5ebfe0ae6c4972dbe9f0174a1637bb8247f6` | +| [`service.name`](https://github.com/elastic/ecs#service.name) | Name of the service the agent is running in. This is normally a user defined name. | keyword | | `user-service` | +| [`service.version`](https://github.com/elastic/ecs#service.version) | Version of the service the agent is running in. This depends on if the service is given a version. | keyword | | `3.2.4` | + + + diff --git a/use-cases/apm.yml b/use-cases/apm.yml new file mode 100644 index 0000000000..419d27a8d4 --- /dev/null +++ b/use-cases/apm.yml @@ -0,0 +1,57 @@ +title: APM +name: apm +description: + ECS usage for the APM data. +fields: +- name: base + fields: + - name: id + type: keyword + description: > + Unique id to describe the event. + example: 8a4f500d + - name: timestamp + type: date + phase: 1 + example: "2016-05-23T08:05:34.853Z" + description: > + Timestamp when the event was created in the app / service. + +- name: agent + description: > + The agent fields are used to describe which agent did send the information. + fields: + - name: version + type: keyword + description: > + APM Agent version. + example: 3.14.0 + - name: name + type: keyword + description: > + APM agent name. + example: elastic-node + +- name: service + description: > + The service fields describe the service inside which the APM agent is running. + fields: + - name: id + type: keyword + description: > + Unique identifier of the running service. + example: d37e5ebfe0ae6c4972dbe9f0174a1637bb8247f6 + + - name: name + type: keyword + example: "user-service" + description: > + Name of the service the agent is running in. This is normally a + user defined name. + + - name: version + type: keyword + example: "3.2.4" + description: > + Version of the service the agent is running in. This depends + on if the service is given a version. 
diff --git a/use-cases/auditbeat.md b/use-cases/auditbeat.md new file mode 100644 index 0000000000..e477c5da82 --- /dev/null +++ b/use-cases/auditbeat.md @@ -0,0 +1,28 @@ +## Auditbeat use case + +ECS usage in Auditbeat. + +### Auditbeat fields + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| [`event.module`](https://github.com/elastic/ecs#event.module) | Auditbeat module name. | | | | +| [`file.*`](https://github.com/elastic/ecs#file.*) | File attributes.
| | | | +| [`file.path`](https://github.com/elastic/ecs#file.path) | The path to the file. | text | | | +| [`file.path.raw`](https://github.com/elastic/ecs#file.path.raw) | The path to the file. This is a non-analyzed field that is useful for aggregations. | keyword | 1 | | +| [`file.target_path`](https://github.com/elastic/ecs#file.target_path) | The target path for symlinks. | keyword | | | +| [`file.type`](https://github.com/elastic/ecs#file.type) | The file type (file, dir, or symlink). | keyword | | | +| [`file.device`](https://github.com/elastic/ecs#file.device) | The device. | keyword | | | +| [`file.inode`](https://github.com/elastic/ecs#file.inode) | The inode representing the file in the filesystem. | keyword | | | +| [`file.uid`](https://github.com/elastic/ecs#file.uid) | The user ID (UID) or security identifier (SID) of the file owner. | keyword | | | +| [`file.owner`](https://github.com/elastic/ecs#file.owner) | The file owner's username. | keyword | | | +| [`file.gid`](https://github.com/elastic/ecs#file.gid) | The primary group ID (GID) of the file. | keyword | | | +| [`file.group`](https://github.com/elastic/ecs#file.group) | The primary group name of the file. | keyword | | | +| [`file.mode`](https://github.com/elastic/ecs#file.mode) | The mode of the file in octal representation. | keyword | | `0640` | +| [`file.size`](https://github.com/elastic/ecs#file.size) | The file size in bytes (field is only added when `type` is `file`). | long | | | +| [`file.mtime`](https://github.com/elastic/ecs#file.mtime) | The last modified time of the file (time when content was modified). | date | | | +| [`file.ctime`](https://github.com/elastic/ecs#file.ctime) | The last change time of the file (time when metadata was changed).
| date | | | + + + diff --git a/use-cases/auditbeat.yml b/use-cases/auditbeat.yml new file mode 100644 index 0000000000..113ecbdc29 --- /dev/null +++ b/use-cases/auditbeat.yml @@ -0,0 +1,147 @@ +title: Auditbeat +name: auditbeat +description: + ECS usage in Auditbeat. +fields: +- name: event + fields: + - name: module + description: > + Auditbeat module name. +- name: file + title: File + description: > + File attributes. + fields: + - name: path + type: text + description: The path to the file. + multi_fields: + - name: raw + type: keyword + description: > + The path to the file. This is a non-analyzed field that is useful + for aggregations. + + - name: target_path + type: keyword + description: The target path for symlinks. + + - name: type + type: keyword + description: The file type (file, dir, or symlink). + + - name: device + type: keyword + description: The device. + + - name: inode + type: keyword + description: The inode representing the file in the filesystem. + + - name: uid + type: keyword + description: > + The user ID (UID) or security identifier (SID) of the file owner. + + - name: owner + type: keyword + description: The file owner's username. + + - name: gid + type: keyword + description: The primary group ID (GID) of the file. + + - name: group + type: keyword + description: The primary group name of the file. + + - name: mode + type: keyword + example: 0640 + description: The mode of the file in octal representation. + + - name: size + type: long + description: The file size in bytes (field is only added when `type` is `file`). + + - name: mtime + type: date + description: The last modified time of the file (time when content was modified). + + - name: ctime + type: date + description: The last change time of the file (time when metadata was changed). + +# TODO (@ruflin 2018-05-01): These fields are not in ECS. Needs decision or removal. +# +#- name: hash +# group: 3 +# description: > +# Hash fields used in Auditbeat. 
+# +# The hash field contains cryptographic hashes of data associated with the event +# (such as a file). The keys are names of cryptographic algorithms. The values +# are encoded as hexidecimal (lower-case). +# +# All fields in user can have one or multiple entries. +# fields: +# - name: md5 +# type: keyword +# description: > +# MD5 hash. +# +# - name: sha1 +# type: keyword +# description: > +# SHA-1 hash. +# +# - name: sha224 +# type: keyword +# description: > +# SHA-224 hash (SHA-2 family). +# +# - name: sha256 +# type: keyword +# description: > +# SHA-256 hash (SHA-2 family). +# +# - name: sha384 +# type: keyword +# description: > +# SHA-384 hash (SHA-2 family). +# +# - name: sha512 +# type: keyword +# description: > +# SHA-512 hash (SHA-2 family). +# +# - name: sha512_224 +# type: keyword +# description: > +# SHA-512/224 hash (SHA-2 family). +# +# - name: sha512_256 +# type: keyword +# description: > +# SHA-512/256 hash (SHA-2 family). +# +# - name: sha3_224 +# type: keyword +# description: > +# SHA3-224 hash (SHA-3 family). +# +# - name: sha3_256 +# type: keyword +# description: > +# SHA3-256 hash (SHA-3 family). +# +# - name: sha3_384 +# type: keyword +# description: > +# SHA3-384 hash (SHA-3 family). +# +# - name: sha3_512 +# type: keyword +# description: > +# SHA3-512 hash (SHA-3 family). diff --git a/use-cases/beats.md b/use-cases/beats.md new file mode 100644 index 0000000000..56f7cd2e86 --- /dev/null +++ b/use-cases/beats.md @@ -0,0 +1,18 @@ +## Beats use case + +ECS fields used in Beats. + +### Beats fields + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| [`id`](https://github.com/elastic/ecs#id) | Unique id to describe the event. | keyword | | `8a4f500d` | +| [`timestamp`](https://github.com/elastic/ecs#timestamp) | Timestamp when the event was created. 
| date | | `2016-05-23T08:05:34.853Z` | +| [`agent.*`](https://github.com/elastic/ecs#agent.*) | The agent fields are used to describe by which beat the information was collected.
| | | | +| [`agent.version`](https://github.com/elastic/ecs#agent.version) | Beat version. | keyword | | `6.0.0-rc2` | +| [`agent.name`](https://github.com/elastic/ecs#agent.name) | Beat name. | keyword | | `filebeat` | +| [`agent.id`](https://github.com/elastic/ecs#agent.id) | Unique beat identifier. | keyword | | `8a4f500d` | + + + diff --git a/use-cases/beats.yml b/use-cases/beats.yml new file mode 100644 index 0000000000..92911bb4dd --- /dev/null +++ b/use-cases/beats.yml @@ -0,0 +1,38 @@ +title: Beats +name: beats +description: + ECS fields used in Beats. +fields: +- name: base + fields: + - name: id + type: keyword + description: > + Unique id to describe the event. + example: 8a4f500d + - name: timestamp + type: date + phase: 1 + example: "2016-05-23T08:05:34.853Z" + description: > + Timestamp when the event was created. + +- name: agent + description: > + The agent fields are used to describe by which beat the information was collected. + fields: + - name: version + type: keyword + description: > + Beat version. + example: 6.0.0-rc2 + - name: name + type: keyword + description: > + Beat name. + example: filebeat + - name: id + type: keyword + description: > + Unique beat identifier. + example: 8a4f500d diff --git a/use-cases/filebeat-apache-access.md b/use-cases/filebeat-apache-access.md new file mode 100644 index 0000000000..d32d4c3092 --- /dev/null +++ b/use-cases/filebeat-apache-access.md @@ -0,0 +1,23 @@ +## Filebeat Apache use case + +ECS fields used in Filebeat for the apache module. + +### Filebeat Apache fields + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| [`id`](https://github.com/elastic/ecs#id) | Unique id to describe the event. | keyword | | `8a4f500d` | +| [`@timestamp`](https://github.com/elastic/ecs#@timestamp) | Timestamp of the log line after processing. 
| date | | `2016-05-23T08:05:34.853Z` | +| [`message`](https://github.com/elastic/ecs#message) | Log message of the event | text | | `Hello World` | +| [`event.module`](https://github.com/elastic/ecs#event.module) | Currently fileset.module | keyword | | `apache` | +| [`event.dataset`](https://github.com/elastic/ecs#event.dataset) | Currently fileset.name | keyword | | `access` | +| [`source.ip`](https://github.com/elastic/ecs#source.ip) | Source ip of the request. Currently apache.access.remote_ip | ip | | `192.168.1.1` | +| [`user.name`](https://github.com/elastic/ecs#user.name) | User name in the request. Currently apache.access.user_name | keyword | | `ruflin` | +| [`user_agent.*`](https://github.com/elastic/ecs#user_agent.*) | User agent fields as in schema. Currently under apache.access.user_agent.*
| | | | +| [`user_agent.raw`](https://github.com/elastic/ecs#user_agent.raw) | Raw user agent. Currently apache.access.agent | text | | `http://elastic.co/` | +| [`geoip.*`](https://github.com/elastic/ecs#geoip.*) | Geoip fields as in schema. Currently under apache.access.geoip.*
These are extracted from source.ip
Should they be under source.geoip?
| | | | +| [`geoip....`](https://github.com/elastic/ecs#geoip....) | All geoip fields. | text | | | + + + diff --git a/use-cases/filebeat-apache-access.yml b/use-cases/filebeat-apache-access.yml new file mode 100644 index 0000000000..d922c7a437 --- /dev/null +++ b/use-cases/filebeat-apache-access.yml @@ -0,0 +1,113 @@ +title: Filebeat Apache +name: filebeat-apache-access +description: + ECS fields used in Filebeat for the apache module. +fields: +- name: base + fields: + - name: id + type: keyword + description: > + Unique id to describe the event. + example: 8a4f500d + - name: "@timestamp" + type: date + example: "2016-05-23T08:05:34.853Z" + description: > + Timestamp of the log line after processing. + - name: message + type: date + example: "Hello World" + description: > + Log message of the event + +- name: event + fields: + - name: module + type: keyword + description: > + Currently fileset.module + example: apache + - name: dataset + type: keyword + example: access + description: > + Currenly fileset.name + +- name: source + fields: + - name: ip + type: ip + description: > + Source ip of the request. Currently apache.access.remote_ip + example: 192.168.1.1 + +- name: user + fields: + - name: name + type: keyword + description: > + User name in the request. Currently apache.access.user_name + example: ruflin + +# TODO (@ruflin 2018-05-01): These fields are not in ECS. Needs decision or removal. 
+# +#- name: http +# fields: +# - name: method +# type: keyword +# description: > +# Http method, currently apache.access.method +# example: GET +# - name: url +# type: keyword +# description: > +# Http url, currently apache.access.url +# example: "http://elastic.co/" +# - name: version +# type: keyword +# description: > +# Http version, currently apache.access.http_version +# example: 1.1 +# - name: response.code +# type: keyword +# description: > +# Http response code, currently apache.access.response_code +# example: 404 +# - name: response.body_sent.bytes +# type: long +# description: > +# Http response body bytes sent, currently apache.access.body_sent.bytes +# example: 117 +# - name: referer +# type: keyword +# description: > +# Http referrer code, currently apache.access.referrer +# +# NOTE: In the RFC its misspell as referer and has become accepted standard +# example: http://elastic.co/ + +- name: user_agent + title: User Agent + description: > + User agent fields as in schema. Currently under apache.access.user_agent.* + fields: + - name: raw + type: text + description: > + Raw user agent. Currently apache.access.agent + example: http://elastic.co/ + +- name: geoip + title: Geoip + description: > + User agent fields as in schema. Currently under apache.access.geoip.* + + These are extracted from source.ip + + Should they be under source.geoip? + fields: + - name: ... + type: text + description: > + All geoip fields. diff --git a/use-cases/logging.md b/use-cases/logging.md new file mode 100644 index 0000000000..661d974e29 --- /dev/null +++ b/use-cases/logging.md @@ -0,0 +1,20 @@ +## Logging use case + +ECS fields used in logging use cases. + +### Logging fields + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| [`id`](https://github.com/elastic/ecs#id) | Unique id of the log entry. | keyword | | `8a4f500d` | +| [`timestamp`](https://github.com/elastic/ecs#timestamp) | Timestamp of the log line. 
| date | | `2016-05-23T08:05:34.853Z` | +| [`message`](https://github.com/elastic/ecs#message) | The log message.
This can contain the full log line or based on the processing only the extracted message part. This is expected to be human readable. | text | | `Hello World` | +| [`hostname`](https://github.com/elastic/ecs#hostname) | Hostname extracted from the log line. | keyword | | `www.example.com` | +| [`ip`](https://github.com/elastic/ecs#ip) | IP Address extracted from the log line. Can be IPv4 or IPv6. | ip | | `192.168.1.12` | +| [`log.level`](https://github.com/elastic/ecs#log.level) | Log level field. Is expected to be `WARN`, `ERR`, `INFO` etc. | keyword | | `ERR` | +| [`log.line`](https://github.com/elastic/ecs#log.line) | Line number the log event was collected from. | long | | `18` | +| [`log.offset`](https://github.com/elastic/ecs#log.offset) | Offset of the log event. | long | | `12` | + + + diff --git a/use-cases/logging.yml b/use-cases/logging.yml new file mode 100644 index 0000000000..e097f4aaa5 --- /dev/null +++ b/use-cases/logging.yml @@ -0,0 +1,69 @@ +title: Logging +name: logging +description: + ECS fields used in logging use cases. +fields: +- name: base + fields: + - name: id + type: keyword + description: > + Unique id of the log entry. + example: 8a4f500d + - name: timestamp + type: date + example: "2016-05-23T08:05:34.853Z" + description: > + Timestamp of the log line. + - name: message + type: text + required: true + example: "Hello World" + description: > + The log message. + + This can contain the full log line or based on the processing + only the extracted message part. This is expected to be human readable. + + - name: hostname + type: keyword + example: "www.example.com" + description: > + Hostname extracted from the log line. + - name: ip + type: ip + example: "192.168.1.12" + description: > + IP Address extracted from the log line. Can be IPv4 or IPv6. + + +- name: log + fields: + - name: level + type: keyword + description: > + Log level field. Is expected to be `WARN`, `ERR`, `INFO` etc. 
+ example: ERR + - name: line + type: long + description: > + Line number the log event was collected from. + example: 18 + - name: offset + type: long + description: > + Offset of the log event. + example: 12 + +# TODO (@ruflin 2018-05-01): These fields are not in ECS. Needs decision or removal. +# Should this be file.path? +# +#- name: source +# description: > +# Describes from where the log entries come from. +# fields: +# - name: path +# type: keyword +# description: > +# File path of the file the data is harvested from. +# example: /var/log/test.log diff --git a/use-cases/metricbeat.md b/use-cases/metricbeat.md new file mode 100644 index 0000000000..e291e29d54 --- /dev/null +++ b/use-cases/metricbeat.md @@ -0,0 +1,30 @@ +## Metricbeat use case + +ECS fields used Metricbeat. + +### Metricbeat fields + + +| Field | Description | Type | Multi Field | Example | +|---|---|---|---|---| +| [`id`](https://github.com/elastic/ecs#id) | Unique id to describe the event. | keyword | | `8a4f500d` | +| [`timestamp`](https://github.com/elastic/ecs#timestamp) | Timestamp when the event was created. | date | | `2016-05-23T08:05:34.853Z` | +| [`agent.version`](https://github.com/elastic/ecs#agent.version) | Beat version. | keyword | | `6.0.0-rc2` | +| [`agent.name`](https://github.com/elastic/ecs#agent.name) | Beat name. | keyword | | `filebeat` | +| [`agent.id`](https://github.com/elastic/ecs#agent.id) | Unique beat identifier. | keyword | | `8a4f500d` | +| [`service.*`](https://github.com/elastic/ecs#service.*) | The service fields describe the service for / from which the data was collected.
If logs or metrics are collected from Redis, `service.name` would be `redis`. This makes it possible to find and correlate logs for a specific service or even version with `service.version`.
| | | | +| [`service.id`](https://github.com/elastic/ecs#service.id) | Unique identifier of the running service.
This id should uniquely identify this service. This makes it possible to correlate logs and metrics for one specific service. For example in case of issues with one redis instance, it's possible to filter on the id to see metrics and logs for this single instance. | keyword | | `d37e5ebfe0ae6c4972dbe9f0174a1637bb8247f6` | +| [`service.name`](https://github.com/elastic/ecs#service.name) | Name of the service data is collected from.
The name is normally the same as the module name. | keyword | | `elasticsearch` | +| [`service.version`](https://github.com/elastic/ecs#service.version) | Version of the service the data was collected from.
This makes it possible to look at a data set for only a specific version of a service. | keyword | | `3.2.4` | +| [`service.host`](https://github.com/elastic/ecs#service.host) | Host address that is used to connect to the service.
This normally contains hostname + port.
REVIEW: Should this be service.uri instead? Sometimes it's more than just the host. It could also include a path or the protocol. | keyword | | `elasticsearch:9200` | +| [`error.*`](https://github.com/elastic/ecs#error.*) | Error namespace
Use for errors which can happen during fetching information for a service.
| | | | +| [`error.message`](https://github.com/elastic/ecs#error.message) | Error message returned by the service during fetching metrics. | text | | | +| [`error.code`](https://github.com/elastic/ecs#error.code) | Error code returned by the service during fetching metrics. | long | | | +| [`host.name`](https://github.com/elastic/ecs#host.name) | Hostname of the system metricbeat is running on or user defined name. | text | | | +| [`host.timezone.offset.sec`](https://github.com/elastic/ecs#host.timezone.offset.sec) | Timezone offset of the host in seconds. | long | | | +| [`host.id`](https://github.com/elastic/ecs#host.id) | Unique host id. | keyword | | | +| [`event.module`](https://github.com/elastic/ecs#event.module) | Name of the module this data is coming from. | keyword | | `mysql` | +| [`event.dataset`](https://github.com/elastic/ecs#event.dataset) | Name of the dataset.
This contains the information which is currently stored in metricset.name and metricset.module. | keyword | | `stats` | + + + diff --git a/use-cases/metricbeat.yml b/use-cases/metricbeat.yml new file mode 100644 index 0000000000..131ad066ec --- /dev/null +++ b/use-cases/metricbeat.yml @@ -0,0 +1,147 @@ +title: Metricbeat +name: metricbeat +description: + ECS fields used Metricbeat. +fields: +- name: base + fields: + - name: id + type: keyword + description: > + Unique id to describe the event. + example: 8a4f500d + - name: timestamp + type: date + phase: 1 + example: "2016-05-23T08:05:34.853Z" + description: > + Timestamp when the event was created. + +- name: agent + fields: + - name: version + type: keyword + description: > + Beat version. + example: 6.0.0-rc2 + - name: name + type: keyword + description: > + Beat name. + example: filebeat + - name: id + type: keyword + description: > + Unique beat identifier. + example: 8a4f500d + +- name: service + description: > + The service fields describe the service for / from which the data was collected. + + If logs or metrics are collected from Redis, `service.name` would be `redis`. This allows + to find and correlate logs for a specicic service or even version with `service.version`. + + fields: + - name: id + type: keyword + description: > + Unique identifier of the running service. + + This id should uniquely identify this service. This makes it possible + to correlate logs and metrics for one specific service. For example + in case of issues with one redis instance, it's possible to filter on the id + to see metrics and logs for this single instance. + + example: d37e5ebfe0ae6c4972dbe9f0174a1637bb8247f6 + + - name: name + type: keyword + example: "elasticsearch" + description: > + Name of the service data is collected from. + + The name is normally the same as the module name. + + - name: version + type: keyword + example: "3.2.4" + description: > + Version of the service the data was collected from. 
+ + This allows to look at a data set only for a specific version of a service. + + - name: host + type: keyword + example: "elasticsearch:9200" + description: > + Host address that is used to connect to the service. + + This normally contains hostname + port. + + REVIEW: Should this be service.uri instead, sometimes it's more then just the host? + It could also include a path or the protocol. + +# TODO (@ruflin 2018-05-01): These fields are not in ECS. Needs decision or removal. +#- name: request +# fields: +# - name: rtt +# type: long +# description: > +# Request round trip time. +# +# How long did the request take to fetch metrics from the service. +# +# REVIEW: THIS DOES NOT EXIST YET IN ECS. +# +# example: 115 + +- name: error + description: > + Error namespace + + Use for errors which can happen during fetching information for a service. + fields: + - name: message + type: text + description: > + Error message returned by the service during fetching metrics. + + - name: code + type: long + description: > + Error code returned by the service during fetching metrics. + +- name: host + fields: + - name: name + type: text + description: > + Hostname of the system metricbeat is running on or user defined name. + + - name: timezone.offset.sec + type: long + description: > + Timezone offset of the host in seconds. + + - name: id + type: keyword + description: > + Unique host id. + +- name: event + fields: + - name: module + type: keyword + description: > + Name of the module this data is coming from. + example: mysql + - name: dataset + type: keyword + description: > + Name of the dataset. + + This contains the information which is currently stored in metricset.name + and metricset.module. + + example: stats