feat: begin work on everything :P

xaseiresh 2023-05-10 09:11:23 +02:00
parent 03b77c7395
commit dca5944549
6 changed files with 445 additions and 6 deletions

Gemfile.lock Normal file

@@ -0,0 +1,46 @@
PATH
  remote: .
  specs:
    timeseries-hoarder (0.1.0)
      influxparser (~> 0.0.5)
      pg (~> 1.5)

GEM
  remote: https://rubygems.org/
  specs:
    ast (2.4.2)
    influxparser (0.0.5)
    json (2.6.3)
    parallel (1.22.1)
    parser (3.2.2.0)
      ast (~> 2.4.1)
    pg (1.5.3)
    rainbow (3.1.1)
    rake (13.0.6)
    regexp_parser (2.7.0)
    rexml (3.2.5)
    rubocop (1.50.1)
      json (~> 2.3)
      parallel (~> 1.10)
      parser (>= 3.2.0.0)
      rainbow (>= 2.2.2, < 4.0)
      regexp_parser (>= 1.8, < 3.0)
      rexml (>= 3.2.5, < 4.0)
      rubocop-ast (>= 1.28.0, < 2.0)
      ruby-progressbar (~> 1.7)
      unicode-display_width (>= 2.4.0, < 3.0)
    rubocop-ast (1.28.0)
      parser (>= 3.2.1.0)
    ruby-progressbar (1.13.0)
    unicode-display_width (2.4.2)

PLATFORMS
  x86_64-linux

DEPENDENCIES
  rake (~> 13.0)
  rubocop (~> 1.21)
  timeseries-hoarder!

BUNDLED WITH
   2.3.26

CachingTable.rb Normal file

@@ -0,0 +1,83 @@
require_relative 'Table.rb'
require 'time'
require 'json'

module Timeseries
  module Hoarder
    class CachingTable < Table
      def initialize(db, name, content_name = 'tags', tag_access_update_delay: 60)
        @content_name = content_name
        @id_column = content_name + '_id'

        super(db, name, 'ts_hoarder')

        @known_tags = {}
        @tag_access_times = {}
        @tag_access_updates = {}
        @tag_access_update_delay = tag_access_update_delay
      end

      def table_creation
        @pg.exec("CREATE TABLE ts_hoarder.#{@table_name} ( #{@id_column} SERIAL PRIMARY KEY, #{@content_name} JSONB, created_at TIMESTAMPTZ, last_used TIMESTAMPTZ )")
        @pg.exec("CREATE INDEX ON ts_hoarder.#{@table_name} USING GIN ( #{@content_name} )")
      end

      def load_cache_content
        @pg.exec("SELECT * FROM ts_hoarder.#{@table_name}") do |results|
          results.each do |tuple|
            tags = JSON.parse(tuple[@content_name])
            @known_tags[tags] = tuple[@id_column]
            @tag_access_times[tags] = Time.parse(tuple['last_used'])
          end
        end

        true
      end
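
      # Duplicate-insert protection: create_entry takes a full table lock so
      # that two concurrent ingestors cannot race between the duplicate check
      # and the INSERT for the same tag set.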
      def create_entry(tags)
        return @known_tags[tags] if @known_tags.include? tags

        returned_id = nil
        @pg.transaction do
          @pg.exec("LOCK TABLE ts_hoarder.#{@table_name}")

          res = @pg.exec_params("SELECT * FROM ts_hoarder.#{@table_name} WHERE #{@content_name} = $1::jsonb", [tags.to_json])
          if res.num_tuples >= 1
            returned_id = res[0][@id_column]
            @known_tags[tags] = returned_id
            @tag_access_times[tags] = Time.parse(res[0]['last_used'])
          else
            res = @pg.exec_params("INSERT INTO ts_hoarder.#{@table_name} (#{@content_name}, created_at, last_used) VALUES ($1::jsonb, NOW(), NOW()) RETURNING #{@id_column}", [tags.to_json])
            returned_id = res[0][@id_column]
            @known_tags[tags] = returned_id
            @tag_access_times[tags] = Time.now
          end
        end

        returned_id
      end
      def [](tags)
        access_time = Time.now
        # Mark this tag set for a last_used refresh at most once per update delay.
        if (access_time - (@tag_access_times[tags] || Time.at(0))) > @tag_access_update_delay
          @tag_access_times[tags] = access_time
          @tag_access_updates[tags] = true
        end

        known_id = @known_tags[tags]
        return known_id unless known_id.nil?

        create_entry(tags)
      end
    end
  end
end
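
A minimal usage sketch (the `db` handle is a Timeseries::Hoarder::Database as defined further down; the tag values are hypothetical):

tags_table = Timeseries::Hoarder::CachingTable.new(db, 'tags')
id = tags_table[{ 'host' => 'web-01' }] # first access inserts a row and returns its ID
id = tags_table[{ 'host' => 'web-01' }] # later accesses are served from the in-memory cache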

Table.rb Normal file

@@ -0,0 +1,42 @@
module Timeseries
  module Hoarder
    class Table
      def initialize(db, table_name, table_schema = "public")
        @table_name = table_name
        @table_schema = table_schema
        @db = db
        @pg = @db.pg
        @created = false

        ensure_table_exists
      end

      def ensure_table_exists
        return if @created

        @pg.transaction do
          # Transaction-scoped advisory lock, released automatically on commit,
          # so concurrent processes cannot create the same table twice.
          @pg.exec("SELECT pg_advisory_xact_lock(0)")

          r = @pg.exec_params("SELECT 1 FROM information_schema.tables WHERE table_name = $1 AND table_schema = $2", [@table_name, @table_schema])
          table_creation if r.num_tuples.zero?
          @created = true
        end
      end

      def table_creation
        raise "No table creation string method provided!"
      end
    end
  end
end
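
Subclasses supply the actual DDL by overriding table_creation; a minimal hypothetical example:

module Timeseries
  module Hoarder
    class EventTable < Table
      def table_creation
        @pg.exec("CREATE TABLE #{@table_schema}.#{@table_name} ( id SERIAL PRIMARY KEY, payload JSONB )")
      end
    end
  end
end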

Database.rb Normal file

@@ -0,0 +1,21 @@
require 'pg'
require_relative 'CachingTable.rb'

module Timeseries
  module Hoarder
    class Database
      attr_reader :pg
      attr_reader :data_sources

      def initialize(pg)
        @pg = pg
        @pg.exec("CREATE SCHEMA IF NOT EXISTS ts_hoarder")

        @data_sources = CachingTable.new(self, 'sources', 'source')
      end
    end
  end
end
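
A minimal sketch of wiring the pieces together (connection parameters are placeholders):

require 'pg'
db = Timeseries::Hoarder::Database.new(PG.connect(dbname: 'ingestor_test'))
source_id = db.data_sources[{ 'host' => 'example-host' }]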

telegraf_psql_ingestor.rb Normal file

@@ -0,0 +1,247 @@
require 'pg'
require 'json'
require 'influxparser'

Process.setproctitle('telegraf-tsdb-ingest')

# The PostgreSQL connection string can be passed as the first CLI argument.
CONNECT_URL = ARGV[0] || "user=postgres dbname=ingestor_test"
SCHEMA = 'telegraf_ingest'

$pg = PG.connect(CONNECT_URL)

$known_tables = {}

$high_cardinality_tags = {
  'process_name' => true,
  'pid' => true
}

$source_tags = {
  'host' => true,
  'location' => true
}
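
# Tags in the two sets above are routed out of the per-series tag set: source
# tags identify where a point came from, while high-cardinality tags stay
# inline as JSONB so they do not bloat the deduplicated tags table.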
def ensure_schema_exists(schema)
  $pg.exec("CREATE SCHEMA IF NOT EXISTS #{schema}")
end

ensure_schema_exists(SCHEMA)
def grab_table_list
  $pg.exec("SELECT * FROM pg_catalog.pg_tables WHERE schemaname IN ('#{SCHEMA}', 'public');") do |result|
    result.each do |tuple|
      table = tuple['schemaname'] + '.' + tuple['tablename']
      $known_tables[table] = true
    end
  end
end

grab_table_list
class DedupContainer
  attr_reader :cache

  def initialize(pg, dataname, datatype)
    @pg = pg
    @dataname = dataname
    @datatype = datatype
    @tablename = "#{SCHEMA}.#{@dataname}s"
    @id_column = "#{@dataname}_id"
    @cache = {}

    setup_table unless $known_tables[@tablename]
  end

  def setup_table
    @pg.exec("CREATE TABLE #{@tablename} ( #{@id_column} SERIAL PRIMARY KEY, #{@dataname} #{@datatype} UNIQUE)")
    @pg.exec("CREATE INDEX ON #{@tablename} #{@datatype == 'JSONB' ? 'USING GIN' : ''} (#{@dataname})")
  end

  def load_table
    @pg.exec("SELECT * FROM #{@tablename}") do |result|
      result.each do |tuple|
        @cache[tuple[@dataname]] = tuple[@id_column]
      end
    end
  end

  def add_key(key)
    key_str = key
    key_str = key.to_json if @datatype == 'JSONB'

    upsert_statement = "INSERT INTO #{@tablename}(#{@dataname}) VALUES ($1::#{@datatype}) RETURNING #{@id_column}"

    id_res = @pg.exec_params("SELECT #{@id_column} FROM #{@tablename} WHERE #{@dataname} = $1::#{@datatype}", [key_str])
    if id_res.ntuples == 0
      id_res = @pg.exec_params(upsert_statement, [key_str])
    end

    key_id = id_res[0][@id_column].to_i
    @cache[key] = key_id
    key_id
  end

  def [](key)
    r = @cache[key]
    return r unless r.nil?

    add_key key
  end
end
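
# Example (hypothetical values): a DedupContainer maps each distinct key to a
# stable integer ID, inserting unseen keys on first access:
#   metrics = DedupContainer.new($pg, 'metric', 'VARCHAR')
#   metrics['cpu_usage'] # => 1 (inserted, then cached)
#   metrics['cpu_usage'] # => 1 (answered from @cache, no query)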
class TimeseriesTable
  attr_reader :internal_tablename

  def initialize(pg, tablename)
    @pg = pg
    @tablename = tablename
    @internal_tablename = "#{SCHEMA}._timeseries_#{tablename}"

    @chunk_time_interval = '1d'
    @compression_interval = '2d'
    @retention_time = '6 months'

    setup_tables unless $known_tables[@internal_tablename]
  end

  def setup_tables
    @pg.exec <<-SQL
      CREATE TABLE #{@internal_tablename} (
        time TIMESTAMPTZ NOT NULL,
        source_id INT NOT NULL,
        tag_id INT NOT NULL,
        metric_id INT NOT NULL,
        high_cardinality_tags JSONB,
        ts_value NUMERIC,
        FOREIGN KEY (source_id) REFERENCES #{SCHEMA}.sources (source_id),
        FOREIGN KEY (tag_id) REFERENCES #{SCHEMA}.tags (tag_id),
        FOREIGN KEY (metric_id) REFERENCES #{SCHEMA}.metrics (metric_id)
      )
    SQL

    @pg.exec "CREATE INDEX ON #{@internal_tablename} (metric_id, source_id, tag_id)"

    @pg.exec "SELECT * FROM create_hypertable('#{@internal_tablename}', 'time', chunk_time_interval => INTERVAL '#{@chunk_time_interval}')"
    @pg.exec "ALTER TABLE #{@internal_tablename} SET (timescaledb.compress, timescaledb.compress_segmentby = 'source_id, tag_id, metric_id, high_cardinality_tags')"
    @pg.exec "SELECT * FROM add_compression_policy('#{@internal_tablename}', INTERVAL '#{@compression_interval}')"
    @pg.exec "SELECT * FROM add_retention_policy('#{@internal_tablename}', INTERVAL '#{@retention_time}')"

    # Human-friendly view resolving the deduplicated IDs back to their values.
    @pg.exec <<-SQL
      CREATE VIEW #{@tablename} AS (
        SELECT time, source, tag, metric, high_cardinality_tags, ts_value
        FROM #{@internal_tablename}
        INNER JOIN #{SCHEMA}.sources USING (source_id)
        INNER JOIN #{SCHEMA}.tags USING (tag_id)
        INNER JOIN #{SCHEMA}.metrics USING (metric_id)
      )
    SQL
  end
end
$known_sources = DedupContainer.new($pg, 'source', 'JSONB')
$known_metrics = DedupContainer.new($pg, 'metric', 'VARCHAR')
$known_tags = DedupContainer.new($pg, 'tag', 'JSONB')

$timeseries = {}

# Debug output left in from development.
puts $known_tables
puts $known_sources
puts $known_sources[{"host" => "xnm-core.lucidragons.de"}]
def ingest_line(line)
  begin
    line = InfluxParser.parse_point(line)
  rescue => e
    STDERR.puts "Error in line protocol parsing: #{e}"
    return
  end

  series = $timeseries[line['series']]
  if series.nil?
    series = TimeseriesTable.new($pg, line['series'])
    $timeseries[line['series']] = series
  end

  line_source_tags = {}
  line_series_tags = {}
  line_high_cardinality_tags = {}

  tags = line['tags']
  values = line['values']
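
  # Points that carry a 'metric' tag plus a generic 'value' field are
  # flattened so that the metric name becomes the field key.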
  if tags.include?('metric') && values.include?('value')
    values[tags['metric']] = values['value']
    tags.delete 'metric'
    values.delete 'value'
  end

  tags.each do |tag, tag_value|
    if $source_tags[tag]
      line_source_tags[tag] = tag_value
    elsif $high_cardinality_tags[tag]
      line_high_cardinality_tags[tag] = tag_value
    else
      line_series_tags[tag] = tag_value
    end
  end
  line_high_cardinality_tags = nil if line_high_cardinality_tags.empty?

  # Influx line protocol timestamps are nanoseconds since the epoch.
  timestamp = Time.at(line['timestamp'].to_f * 1e-9)

  line_source_id = $known_sources[line_source_tags]
  line_series_id = $known_tags[line_series_tags]

  metric_ids_array = []
  values_array = []
  values.each do |metric, value|
    # Only numeric fields are stored; string and boolean fields are skipped.
    next unless value.is_a? Numeric

    metric_ids_array << $known_metrics[metric]
    values_array << value
  end

  puts "Inserting into #{series.internal_tablename}"

  # Render the arrays in PostgreSQL array literal form.
  metric_ids_array = '{' + metric_ids_array.join(',') + '}'
  values_array = '{' + values_array.join(',') + '}'
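
  # unnest() expands the two arrays row-wise, so the single INSERT below
  # writes one row per metric while the time, source and tag columns repeat.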
  insert_statement = <<-SQL
    INSERT INTO #{series.internal_tablename}
      (time, source_id, tag_id, metric_id, ts_value, high_cardinality_tags)
    VALUES ($1::timestamptz, $2::int, $3::int, unnest($4::int[]), unnest($5::numeric[]), $6::jsonb)
  SQL

  $pg.exec_params(insert_statement, [timestamp, line_source_id, line_series_id, metric_ids_array, values_array, line_high_cardinality_tags.to_json])
end
$stdin.sync = true

lines_queue = Queue.new

# Reader thread: push stdin lines onto the queue, stopping cleanly at EOF.
Thread.new do
  while (input_line = STDIN.gets)
    lines_queue << input_line
  end
end

# Batch everything that arrived during the last interval into one transaction.
loop do
  sleep 10
  next if lines_queue.empty?

  $pg.transaction do
    ingest_line(lines_queue.pop) until lines_queue.empty?
  end
end
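
The script reads Influx line protocol on stdin; a minimal invocation sketch (measurement, tags and timestamp are illustrative):

echo 'cpu,host=web-01 usage_idle=97.3 1683709883000000000' | ruby telegraf_psql_ingestor.rb 'user=postgres dbname=ingestor_test'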

timeseries-hoarder.gemspec

@@ -8,16 +8,15 @@ Gem::Specification.new do |spec|
   spec.authors = ["xaseiresh"]
   spec.email = ["davidbailey.2889@gmail.com"]
-  spec.summary = "TODO: Write a short summary, because RubyGems requires one."
-  spec.description = "TODO: Write a longer description or delete this line."
-  spec.homepage = "TODO: Put your gem's website or public repo URL here."
+  spec.summary = "Quick&Dirty time series ingestor gem"
+  spec.description = "Quickly, comfortably, reliably and flexibly ingest your Influx-Style time series into TimescaleDB"
+  spec.homepage = "https://forgejo.lucidragons.de/lucidergs/timeseries-hoarder"
   spec.required_ruby_version = ">= 2.6.0"
-  spec.metadata["allowed_push_host"] = "TODO: Set to your gem server 'https://example.com'"
   spec.metadata["homepage_uri"] = spec.homepage
-  spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
-  spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
+  spec.metadata["source_code_uri"] = "https://forgejo.lucidragons.de/lucidergs/timeseries-hoarder"
   # Specify which files should be added to the gem when it is released.
   # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
@@ -31,7 +30,8 @@ Gem::Specification.new do |spec|
   spec.require_paths = ["lib"]
   # Uncomment to register a new dependency of your gem
   # spec.add_dependency "example-gem", "~> 1.0"
+  spec.add_dependency "pg", "~> 1.5"
+  spec.add_dependency "influxparser", "~> 0.0.5"
   # For more information and examples about making a new gem, check out our
   # guide at: https://bundler.io/guides/creating_gem.html