1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
// This file is part of Substrate.

// Copyright (C) 2017-2022 Parity Technologies (UK) Ltd.
// SPDX-License-Identifier: GPL-3.0-or-later WITH Classpath-exception-2.0

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

use crate::TelemetryPayload;
use futures::{channel::mpsc, prelude::*};
use libp2p::{core::transport::Transport, Multiaddr};
use rand::Rng as _;
use std::{
	fmt, mem,
	pin::Pin,
	task::{Context, Poll},
	time::Duration,
};
use wasm_timer::Delay;

pub(crate) type ConnectionNotifierSender = mpsc::Sender<()>;
pub(crate) type ConnectionNotifierReceiver = mpsc::Receiver<()>;

pub(crate) fn connection_notifier_channel() -> (ConnectionNotifierSender, ConnectionNotifierReceiver)
{
	mpsc::channel(0)
}

/// Handler for a single telemetry node.
///
/// This is a wrapper `Sink` around a network `Sink` with 3 particularities:
///  - It is infallible: if the connection stops, it will reconnect automatically when the server
///    becomes available again.
///  - It holds a list of "connection messages" which are sent automatically when the connection is
///    (re-)established. This is used for the "system.connected" message that needs to be send for
///    every substrate node that connects.
///  - It doesn't stay in pending while waiting for connection. Instead, it moves data into the void
///    if the connection could not be established. This is important for the `Dispatcher` `Sink`
///    which we don't want to block if one connection is broken.
#[derive(Debug)]
pub(crate) struct Node<TTrans: Transport> {
	/// Address of the node.
	addr: Multiaddr,
	/// State of the connection.
	socket: NodeSocket<TTrans>,
	/// Transport used to establish new connections.
	transport: TTrans,
	/// Messages that are sent when the connection (re-)establishes.
	pub(crate) connection_messages: Vec<TelemetryPayload>,
	/// Notifier for when the connection (re-)establishes.
	pub(crate) telemetry_connection_notifier: Vec<ConnectionNotifierSender>,
}

enum NodeSocket<TTrans: Transport> {
	/// We're connected to the node. This is the normal state.
	Connected(NodeSocketConnected<TTrans>),
	/// We are currently dialing the node.
	Dialing(TTrans::Dial),
	/// A new connection should be started as soon as possible.
	ReconnectNow,
	/// Waiting before attempting to dial again.
	WaitingReconnect(Delay),
	/// Temporary transition state.
	Poisoned,
}

impl<TTrans: Transport> NodeSocket<TTrans> {
	fn wait_reconnect() -> NodeSocket<TTrans> {
		let random_delay = rand::thread_rng().gen_range(10, 20);
		let delay = Delay::new(Duration::from_secs(random_delay));
		log::trace!(target: "telemetry", "Pausing for {} secs before reconnecting", random_delay);
		NodeSocket::WaitingReconnect(delay)
	}
}

struct NodeSocketConnected<TTrans: Transport> {
	/// Where to send data.
	sink: TTrans::Output,
	/// Queue of packets to send before accepting new packets.
	buf: Vec<Vec<u8>>,
}

impl<TTrans: Transport> Node<TTrans> {
	/// Builds a new node handler.
	pub(crate) fn new(
		transport: TTrans,
		addr: Multiaddr,
		connection_messages: Vec<serde_json::Map<String, serde_json::Value>>,
		telemetry_connection_notifier: Vec<ConnectionNotifierSender>,
	) -> Self {
		Node {
			addr,
			socket: NodeSocket::ReconnectNow,
			transport,
			connection_messages,
			telemetry_connection_notifier,
		}
	}
}

impl<TTrans: Transport, TSinkErr> Node<TTrans>
where
	TTrans::Dial: Unpin,
	TTrans::Output:
		Sink<Vec<u8>, Error = TSinkErr> + Stream<Item = Result<Vec<u8>, TSinkErr>> + Unpin,
	TSinkErr: fmt::Debug,
{
	// NOTE: this code has been inspired from `Buffer` (`futures_util::sink::Buffer`).
	//       https://docs.rs/futures-util/0.3.8/src/futures_util/sink/buffer.rs.html#32
	fn try_send_connection_messages(
		self: Pin<&mut Self>,
		cx: &mut Context<'_>,
		conn: &mut NodeSocketConnected<TTrans>,
	) -> Poll<Result<(), TSinkErr>> {
		while let Some(item) = conn.buf.pop() {
			if let Err(e) = conn.sink.start_send_unpin(item) {
				return Poll::Ready(Err(e))
			}
			futures::ready!(conn.sink.poll_ready_unpin(cx))?;
		}
		Poll::Ready(Ok(()))
	}
}

pub(crate) enum Infallible {}

impl<TTrans: Transport, TSinkErr> Sink<TelemetryPayload> for Node<TTrans>
where
	TTrans: Unpin,
	TTrans::Dial: Unpin,
	TTrans::Output:
		Sink<Vec<u8>, Error = TSinkErr> + Stream<Item = Result<Vec<u8>, TSinkErr>> + Unpin,
	TSinkErr: fmt::Debug,
{
	type Error = Infallible;

	fn poll_ready(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll<Result<(), Self::Error>> {
		let mut socket = mem::replace(&mut self.socket, NodeSocket::Poisoned);
		self.socket = loop {
			match socket {
				NodeSocket::Connected(mut conn) => match conn.sink.poll_ready_unpin(cx) {
					Poll::Ready(Ok(())) => {
						match self.as_mut().try_send_connection_messages(cx, &mut conn) {
							Poll::Ready(Err(err)) => {
								log::warn!(target: "telemetry", "⚠️  Disconnected from {}: {:?}", self.addr, err);
								socket = NodeSocket::wait_reconnect();
							},
							Poll::Ready(Ok(())) => {
								self.socket = NodeSocket::Connected(conn);
								return Poll::Ready(Ok(()))
							},
							Poll::Pending => {
								self.socket = NodeSocket::Connected(conn);
								return Poll::Pending
							},
						}
					},
					Poll::Ready(Err(err)) => {
						log::warn!(target: "telemetry", "⚠️  Disconnected from {}: {:?}", self.addr, err);
						socket = NodeSocket::wait_reconnect();
					},
					Poll::Pending => {
						self.socket = NodeSocket::Connected(conn);
						return Poll::Pending
					},
				},
				NodeSocket::Dialing(mut s) => match Future::poll(Pin::new(&mut s), cx) {
					Poll::Ready(Ok(sink)) => {
						log::debug!(target: "telemetry", "✅ Connected to {}", self.addr);

						{
							let mut index = 0;
							while index < self.telemetry_connection_notifier.len() {
								let sender = &mut self.telemetry_connection_notifier[index];
								if let Err(error) = sender.try_send(()) {
									if !error.is_disconnected() {
										log::debug!(target: "telemetry", "Failed to send a telemetry connection notification: {}", error);
									} else {
										self.telemetry_connection_notifier.swap_remove(index);
										continue
									}
								}
								index += 1;
							}
						}

						let buf = self
							.connection_messages
							.iter()
							.map(|json| {
								let mut json = json.clone();
								json.insert(
									"ts".to_string(),
									chrono::Local::now().to_rfc3339().into(),
								);
								json
							})
							.filter_map(|json| match serde_json::to_vec(&json) {
								Ok(message) => Some(message),
								Err(err) => {
									log::error!(
										target: "telemetry",
										"An error occurred while generating new connection \
										messages: {}",
										err,
									);
									None
								},
							})
							.collect();

						socket = NodeSocket::Connected(NodeSocketConnected { sink, buf });
					},
					Poll::Pending => break NodeSocket::Dialing(s),
					Poll::Ready(Err(err)) => {
						log::warn!(target: "telemetry", "❌ Error while dialing {}: {:?}", self.addr, err);
						socket = NodeSocket::wait_reconnect();
					},
				},
				NodeSocket::ReconnectNow => {
					let addr = self.addr.clone();
					match self.transport.dial(addr) {
						Ok(d) => {
							log::trace!(target: "telemetry", "Re-dialing {}", self.addr);
							socket = NodeSocket::Dialing(d);
						},
						Err(err) => {
							log::warn!(target: "telemetry", "❌ Error while re-dialing {}: {:?}", self.addr, err);
							socket = NodeSocket::wait_reconnect();
						},
					}
				},
				NodeSocket::WaitingReconnect(mut s) => {
					if Future::poll(Pin::new(&mut s), cx).is_ready() {
						socket = NodeSocket::ReconnectNow;
					} else {
						break NodeSocket::WaitingReconnect(s)
					}
				},
				NodeSocket::Poisoned => {
					log::error!(target: "telemetry", "‼️ Poisoned connection with {}", self.addr);
					break NodeSocket::Poisoned
				},
			}
		};

		// The Dispatcher blocks when the Node syncs blocks. This is why it is important that the
		// Node sinks don't go into "Pending" state while waiting for reconnection but rather
		// discard the excess of telemetry messages.
		Poll::Ready(Ok(()))
	}

	fn start_send(mut self: Pin<&mut Self>, item: TelemetryPayload) -> Result<(), Self::Error> {
		// Any buffered outgoing telemetry messages are discarded while (re-)connecting.
		match &mut self.socket {
			NodeSocket::Connected(conn) => match serde_json::to_vec(&item) {
				Ok(data) => {
					log::trace!(target: "telemetry", "Sending {} bytes", data.len());
					let _ = conn.sink.start_send_unpin(data);
				},
				Err(err) => log::debug!(
					target: "telemetry",
					"Could not serialize payload: {}",
					err,
				),
			},
			// We are currently dialing the node.
			NodeSocket::Dialing(_) => log::trace!(target: "telemetry", "Dialing"),
			// A new connection should be started as soon as possible.
			NodeSocket::ReconnectNow => log::trace!(target: "telemetry", "Reconnecting"),
			// Waiting before attempting to dial again.
			NodeSocket::WaitingReconnect(_) => {},
			// Temporary transition state.
			NodeSocket::Poisoned => log::trace!(target: "telemetry", "Poisoned"),
		}
		Ok(())
	}

	fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
		match &mut self.socket {
			NodeSocket::Connected(conn) => match conn.sink.poll_flush_unpin(cx) {
				Poll::Ready(Err(e)) => {
					// When `telemetry` closes the websocket connection we end
					// up here, which is sub-optimal. See
					// https://github.com/libp2p/rust-libp2p/issues/2021 for
					// what we could do to improve this.
					log::trace!(target: "telemetry", "[poll_flush] Error: {:?}", e);
					self.socket = NodeSocket::wait_reconnect();
					Poll::Ready(Ok(()))
				},
				Poll::Ready(Ok(())) => Poll::Ready(Ok(())),
				Poll::Pending => Poll::Pending,
			},
			_ => Poll::Ready(Ok(())),
		}
	}

	fn poll_close(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
		match &mut self.socket {
			NodeSocket::Connected(conn) => conn.sink.poll_close_unpin(cx).map(|_| Ok(())),
			_ => Poll::Ready(Ok(())),
		}
	}
}

impl<TTrans: Transport> fmt::Debug for NodeSocket<TTrans> {
	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
		use NodeSocket::*;
		f.write_str(match self {
			Connected(_) => "Connected",
			Dialing(_) => "Dialing",
			ReconnectNow => "ReconnectNow",
			WaitingReconnect(_) => "WaitingReconnect",
			Poisoned => "Poisoned",
		})
	}
}