mirror of
https://github.com/PaiGramTeam/telegram-bot-api.git
synced 2024-11-16 04:35:33 +00:00
Add watchdog for main thread hanging.
This commit is contained in:
parent
90910f6ded
commit
aa9eff357c
@ -85,6 +85,7 @@ set(TELEGRAM_BOT_API_SOURCE
|
|||||||
telegram-bot-api/HttpStatConnection.cpp
|
telegram-bot-api/HttpStatConnection.cpp
|
||||||
telegram-bot-api/Query.cpp
|
telegram-bot-api/Query.cpp
|
||||||
telegram-bot-api/Stats.cpp
|
telegram-bot-api/Stats.cpp
|
||||||
|
telegram-bot-api/Watchdog.cpp
|
||||||
telegram-bot-api/WebhookActor.cpp
|
telegram-bot-api/WebhookActor.cpp
|
||||||
|
|
||||||
telegram-bot-api/Client.h
|
telegram-bot-api/Client.h
|
||||||
@ -95,6 +96,7 @@ set(TELEGRAM_BOT_API_SOURCE
|
|||||||
telegram-bot-api/HttpStatConnection.h
|
telegram-bot-api/HttpStatConnection.h
|
||||||
telegram-bot-api/Query.h
|
telegram-bot-api/Query.h
|
||||||
telegram-bot-api/Stats.h
|
telegram-bot-api/Stats.h
|
||||||
|
telegram-bot-api/Watchdog.h
|
||||||
telegram-bot-api/WebhookActor.h
|
telegram-bot-api/WebhookActor.h
|
||||||
)
|
)
|
||||||
|
|
||||||
|
27
telegram-bot-api/Watchdog.cpp
Normal file
27
telegram-bot-api/Watchdog.cpp
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
//
|
||||||
|
// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2021
|
||||||
|
//
|
||||||
|
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||||
|
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||||
|
//
|
||||||
|
#include "telegram-bot-api/Watchdog.h"
|
||||||
|
|
||||||
|
#include "td/utils/Time.h"
|
||||||
|
|
||||||
|
namespace telegram_bot_api {
|
||||||
|
|
||||||
|
void Watchdog::kick() {
|
||||||
|
auto now = td::Time::now();
|
||||||
|
if (now >= last_kick_time_ + timeout_ && last_kick_time_ > 0) {
|
||||||
|
LOG(ERROR) << "Watchdog timeout expired after " << now - last_kick_time_ << " seconds";
|
||||||
|
td::thread::send_real_time_signal(main_thread_id_, 2);
|
||||||
|
}
|
||||||
|
last_kick_time_ = now;
|
||||||
|
set_timeout_in(timeout_);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Watchdog::timeout_expired() {
|
||||||
|
kick();
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace telegram_bot_api
|
31
telegram-bot-api/Watchdog.h
Normal file
31
telegram-bot-api/Watchdog.h
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
//
|
||||||
|
// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2021
|
||||||
|
//
|
||||||
|
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||||
|
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||||
|
//
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "td/actor/actor.h"
|
||||||
|
|
||||||
|
#include "td/utils/port/thread.h"
|
||||||
|
|
||||||
|
namespace telegram_bot_api {
|
||||||
|
|
||||||
|
class Watchdog final : public td::Actor {
|
||||||
|
public:
|
||||||
|
Watchdog(td::thread::id main_thread_id, double timeout) : main_thread_id_(main_thread_id), timeout_(timeout) {
|
||||||
|
// watchdog is disabled until it is kicked for the first time
|
||||||
|
}
|
||||||
|
|
||||||
|
void kick();
|
||||||
|
|
||||||
|
private:
|
||||||
|
void timeout_expired() final;
|
||||||
|
|
||||||
|
td::thread::id main_thread_id_;
|
||||||
|
double timeout_;
|
||||||
|
double last_kick_time_ = 0.0;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace telegram_bot_api
|
@ -11,6 +11,7 @@
|
|||||||
#include "telegram-bot-api/HttpStatConnection.h"
|
#include "telegram-bot-api/HttpStatConnection.h"
|
||||||
#include "telegram-bot-api/Query.h"
|
#include "telegram-bot-api/Query.h"
|
||||||
#include "telegram-bot-api/Stats.h"
|
#include "telegram-bot-api/Stats.h"
|
||||||
|
#include "telegram-bot-api/Watchdog.h"
|
||||||
|
|
||||||
#include "td/telegram/ClientActor.h"
|
#include "td/telegram/ClientActor.h"
|
||||||
|
|
||||||
@ -462,7 +463,7 @@ int main(int argc, char *argv[]) {
|
|||||||
// << (td::GitInfo::is_dirty() ? "(dirty)" : "") << " started";
|
// << (td::GitInfo::is_dirty() ? "(dirty)" : "") << " started";
|
||||||
LOG(WARNING) << "Bot API " << parameters->version_ << " server started";
|
LOG(WARNING) << "Bot API " << parameters->version_ << " server started";
|
||||||
|
|
||||||
const int threads_n = 5; // +3 for Td, one for slow HTTP connections and one for DNS resolving
|
const int threads_n = 6; // +3 for Td, one for watchdog, one for slow HTTP connections, one for DNS resolving
|
||||||
td::ConcurrentScheduler sched;
|
td::ConcurrentScheduler sched;
|
||||||
sched.init(threads_n);
|
sched.init(threads_n);
|
||||||
|
|
||||||
@ -474,6 +475,7 @@ int main(int argc, char *argv[]) {
|
|||||||
|
|
||||||
auto client_manager =
|
auto client_manager =
|
||||||
sched.create_actor_unsafe<ClientManager>(0, "ClientManager", std::move(parameters), token_range).release();
|
sched.create_actor_unsafe<ClientManager>(0, "ClientManager", std::move(parameters), token_range).release();
|
||||||
|
|
||||||
sched
|
sched
|
||||||
.create_actor_unsafe<HttpServer>(
|
.create_actor_unsafe<HttpServer>(
|
||||||
0, "HttpServer", http_ip_address, http_port,
|
0, "HttpServer", http_ip_address, http_port,
|
||||||
@ -482,6 +484,7 @@ int main(int argc, char *argv[]) {
|
|||||||
td::create_actor<HttpConnection>("HttpConnection", client_manager, shared_data));
|
td::create_actor<HttpConnection>("HttpConnection", client_manager, shared_data));
|
||||||
})
|
})
|
||||||
.release();
|
.release();
|
||||||
|
|
||||||
if (http_stat_port != 0) {
|
if (http_stat_port != 0) {
|
||||||
sched
|
sched
|
||||||
.create_actor_unsafe<HttpServer>(
|
.create_actor_unsafe<HttpServer>(
|
||||||
@ -492,8 +495,14 @@ int main(int argc, char *argv[]) {
|
|||||||
})
|
})
|
||||||
.release();
|
.release();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
constexpr double WATCHDOG_TIMEOUT = 0.5;
|
||||||
|
auto watchdog_id =
|
||||||
|
sched.create_actor_unsafe<Watchdog>(threads_n - 2, "Watchdog", td::this_thread::get_id(), WATCHDOG_TIMEOUT);
|
||||||
|
|
||||||
sched.start();
|
sched.start();
|
||||||
|
|
||||||
|
double next_watchdog_kick_time = start_time;
|
||||||
double next_cron_time = start_time;
|
double next_cron_time = start_time;
|
||||||
double last_dump_time = start_time - 1000.0;
|
double last_dump_time = start_time - 1000.0;
|
||||||
double last_tqueue_gc_time = start_time - 1000.0;
|
double last_tqueue_gc_time = start_time - 1000.0;
|
||||||
@ -503,7 +512,7 @@ int main(int argc, char *argv[]) {
|
|||||||
std::atomic_bool can_quit{false};
|
std::atomic_bool can_quit{false};
|
||||||
ServerCpuStat::instance(); // create ServerCpuStat instance
|
ServerCpuStat::instance(); // create ServerCpuStat instance
|
||||||
while (true) {
|
while (true) {
|
||||||
sched.run_main(next_cron_time - td::Time::now());
|
sched.run_main(td::min(next_cron_time, next_watchdog_kick_time) - td::Time::now());
|
||||||
|
|
||||||
if (!need_reopen_log.test_and_set()) {
|
if (!need_reopen_log.test_and_set()) {
|
||||||
td::log_interface->after_rotation();
|
td::log_interface->after_rotation();
|
||||||
@ -519,6 +528,7 @@ int main(int argc, char *argv[]) {
|
|||||||
dump_statistics(shared_data, net_query_stats);
|
dump_statistics(shared_data, net_query_stats);
|
||||||
close_flag = true;
|
close_flag = true;
|
||||||
auto guard = sched.get_main_guard();
|
auto guard = sched.get_main_guard();
|
||||||
|
watchdog_id.reset();
|
||||||
send_closure(client_manager, &ClientManager::close, td::PromiseCreator::lambda([&can_quit](td::Unit) {
|
send_closure(client_manager, &ClientManager::close, td::PromiseCreator::lambda([&can_quit](td::Unit) {
|
||||||
can_quit.store(true);
|
can_quit.store(true);
|
||||||
td::Scheduler::instance()->yield();
|
td::Scheduler::instance()->yield();
|
||||||
@ -557,6 +567,12 @@ int main(int argc, char *argv[]) {
|
|||||||
ServerCpuStat::update(now);
|
ServerCpuStat::update(now);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (now >= start_time + 600) {
|
||||||
|
auto guard = sched.get_main_guard();
|
||||||
|
send_closure(watchdog_id, &Watchdog::kick);
|
||||||
|
next_watchdog_kick_time = now + WATCHDOG_TIMEOUT / 2;
|
||||||
|
}
|
||||||
|
|
||||||
if (now > last_tqueue_gc_time + 60.0) {
|
if (now > last_tqueue_gc_time + 60.0) {
|
||||||
auto unix_time = shared_data->get_unix_time(now);
|
auto unix_time = shared_data->get_unix_time(now);
|
||||||
LOG(INFO) << "Run TQueue GC at " << unix_time;
|
LOG(INFO) << "Run TQueue GC at " << unix_time;
|
||||||
|
Loading…
Reference in New Issue
Block a user