1 /*-
2 * collectd - src/mcelog.c
3 * MIT License
4 *
5 * Copyright(c) 2016 Intel Corporation. All rights reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
25 * Authors:
26 * Maryam Tahhan <maryam.tahhan@intel.com>
27 * Volodymyr Mytnyk <volodymyrx.mytnyk@intel.com>
28 * Taras Chornyi <tarasx.chornyi@intel.com>
29 * Krzysztof Matczak <krzysztofx.matczak@intel.com>
30 */
32 #include "common.h"
33 #include "collectd.h"
35 #include <poll.h>
36 #include <sys/socket.h>
37 #include <sys/un.h>
38 #include <unistd.h>
/* Plugin name: used when registering callbacks and as log message prefix. */
#define MCELOG_PLUGIN "mcelog"

/* Size in bytes of the scratch buffers that receive error descriptions. */
#define MCELOG_BUFF_SIZE 1024

/* Upper bound for one poll() on the mcelog socket; the same value is used
 * as the pause between two reconnect attempts. */
#define MCELOG_POLL_TIMEOUT 1000 /* ms */

/* Prefix of the dump line holding the memory location; also used as the
 * notification meta data key for that location. */
#define MCELOG_SOCKET_STR "SOCKET"

/* Prefix of the dump line holding the DIMM name; also used as the
 * notification meta data key for that name. */
#define MCELOG_DIMM_NAME "DMI_NAME"

/* Headings that introduce the corrected / uncorrected counters in the dump;
 * also used as notification meta data keys for the total counters. */
#define MCELOG_CORRECTED_ERR "corrected memory errors:"
#define MCELOG_UNCORRECTED_ERR "uncorrected memory errors:"
48 typedef struct mcelog_config_s {
49 char logfile[PATH_MAX]; /* mcelog logfile */
50 pthread_t tid; /* poll thread id */
51 } mcelog_config_t;
53 typedef struct socket_adapter_s socket_adapter_t;
55 struct socket_adapter_s {
56 int sock_fd; /* mcelog server socket fd */
57 struct sockaddr_un unix_sock; /* mcelog client socket */
58 pthread_rwlock_t lock;
59 /* function pointers for socket operations */
60 int (*write)(socket_adapter_t *self, const char *msg, const size_t len);
61 int (*reinit)(socket_adapter_t *self);
62 int (*receive)(socket_adapter_t *self, FILE **p_file);
63 int (*close)(socket_adapter_t *self);
64 };
66 typedef struct mcelog_memory_rec_s {
67 int corrected_err_total; /* x total*/
68 int corrected_err_timed; /* x in 24h*/
69 char corrected_err_timed_period[DATA_MAX_NAME_LEN];
70 int uncorrected_err_total; /* x total*/
71 int uncorrected_err_timed; /* x in 24h*/
72 char uncorrected_err_timed_period[DATA_MAX_NAME_LEN];
73 char location[DATA_MAX_NAME_LEN]; /* SOCKET x CHANNEL x DIMM x*/
74 char dimm_name[DATA_MAX_NAME_LEN]; /* DMI_NAME "DIMM_F1" */
75 } mcelog_memory_rec_t;
77 static int socket_close(socket_adapter_t *self);
78 static int socket_write(socket_adapter_t *self, const char *msg,
79 const size_t len);
80 static int socket_reinit(socket_adapter_t *self);
81 static int socket_receive(socket_adapter_t *self, FILE **p_file);
83 static mcelog_config_t g_mcelog_config = {
84 .logfile = "/var/log/mcelog", .tid = 0,
85 };
87 static socket_adapter_t socket_adapter = {
88 .sock_fd = -1,
89 .unix_sock =
90 {
91 .sun_family = AF_UNIX, .sun_path = "/var/run/mcelog-client",
92 },
93 .lock = PTHREAD_RWLOCK_INITIALIZER,
94 .close = socket_close,
95 .write = socket_write,
96 .reinit = socket_reinit,
97 .receive = socket_receive,
98 };
/* Set by poll_worker() when it starts and cleared by its cleanup handler;
 * mcelog_shutdown() reads it to decide whether the thread must be stopped.
 * NOTE(review): plain flag shared between threads without synchronization. */
static _Bool mcelog_thread_running = 0;
102 static int mcelog_config(oconfig_item_t *ci) {
103 for (int i = 0; i < ci->children_num; i++) {
104 oconfig_item_t *child = ci->children + i;
105 if (strcasecmp("McelogClientSocket", child->key) == 0) {
106 if (cf_util_get_string_buffer(child, socket_adapter.unix_sock.sun_path,
107 sizeof(socket_adapter.unix_sock.sun_path)) <
108 0) {
109 ERROR("%s: Invalid configuration option: \"%s\".", MCELOG_PLUGIN,
110 child->key);
111 return -1;
112 }
113 } else if (strcasecmp("McelogLogfile", child->key) == 0) {
114 if (cf_util_get_string_buffer(child, g_mcelog_config.logfile,
115 sizeof(g_mcelog_config.logfile)) < 0) {
116 ERROR("%s: Invalid configuration option: \"%s\".", MCELOG_PLUGIN,
117 child->key);
118 return -1;
119 }
120 } else {
121 ERROR("%s: Invalid configuration option: \"%s\".", MCELOG_PLUGIN,
122 child->key);
123 return -1;
124 }
125 }
126 return (0);
127 }
129 static int socket_close(socket_adapter_t *self) {
130 int ret = 0;
131 pthread_rwlock_rdlock(&self->lock);
132 if (fcntl(self->sock_fd, F_GETFL) != -1) {
133 if (shutdown(self->sock_fd, SHUT_RDWR) != 0) {
134 char errbuf[MCELOG_BUFF_SIZE];
135 ERROR("%s: Socket shutdown failed: %s", MCELOG_PLUGIN,
136 sstrerror(errno, errbuf, sizeof(errbuf)));
137 ret = -1;
138 }
139 close(self->sock_fd);
140 }
141 pthread_rwlock_unlock(&self->lock);
142 return ret;
143 }
145 static int socket_write(socket_adapter_t *self, const char *msg,
146 const size_t len) {
147 int ret = 0;
148 pthread_rwlock_rdlock(&self->lock);
149 if (swrite(self->sock_fd, msg, len) < 0)
150 ret = -1;
151 pthread_rwlock_unlock(&self->lock);
152 return ret;
153 }
155 static void mcelog_dispatch_notification(notification_t *n) {
156 if (!n) {
157 ERROR(MCELOG_PLUGIN ": %s: NULL pointer", __FUNCTION__);
158 return;
159 }
161 sstrncpy(n->host, hostname_g, sizeof(n->host));
162 sstrncpy(n->type, "gauge", sizeof(n->type));
163 plugin_dispatch_notification(n);
164 if (n->meta)
165 plugin_notification_meta_free(n->meta);
166 }
168 static int socket_reinit(socket_adapter_t *self) {
169 char errbuff[MCELOG_BUFF_SIZE];
170 int ret = -1;
171 cdtime_t interval = plugin_get_interval();
172 struct timeval socket_timeout = CDTIME_T_TO_TIMEVAL(interval);
174 /* synchronization via write lock since sock_fd may be changed here */
175 pthread_rwlock_wrlock(&self->lock);
176 self->sock_fd = socket(PF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
177 if (self->sock_fd < 0) {
178 ERROR("%s: Could not create a socket. %s", MCELOG_PLUGIN,
179 sstrerror(errno, errbuff, sizeof(errbuff)));
180 pthread_rwlock_unlock(&self->lock);
181 return ret;
182 }
184 /* Set socket timeout option */
185 if (setsockopt(self->sock_fd, SOL_SOCKET, SO_SNDTIMEO,
186 &socket_timeout, sizeof(socket_timeout)) < 0)
187 ERROR("%s: Failed to set the socket timeout option.", MCELOG_PLUGIN);
189 /* downgrading to read lock due to possible recursive read locks
190 * in self->close(self) call */
191 pthread_rwlock_unlock(&self->lock);
192 pthread_rwlock_rdlock(&self->lock);
193 if (connect(self->sock_fd, (struct sockaddr *)&(self->unix_sock),
194 sizeof(self->unix_sock)) < 0) {
195 ERROR("%s: Failed to connect to mcelog server. %s", MCELOG_PLUGIN,
196 sstrerror(errno, errbuff, sizeof(errbuff)));
197 self->close(self);
198 ret = -1;
199 } else {
200 ret = 0;
201 mcelog_dispatch_notification(
202 &(notification_t){.severity = NOTIF_OKAY,
203 .time = cdtime(),
204 .message = "Connected to mcelog server",
205 .plugin = MCELOG_PLUGIN,
206 .type_instance = "mcelog_status"});
207 }
208 pthread_rwlock_unlock(&self->lock);
209 return ret;
210 }
212 static int mcelog_prepare_notification(notification_t *n,
213 mcelog_memory_rec_t mr) {
214 if (n == NULL)
215 return (-1);
217 if (plugin_notification_meta_add_string(n, MCELOG_SOCKET_STR, mr.location) <
218 0) {
219 ERROR("%s: add memory location meta data failed", MCELOG_PLUGIN);
220 return (-1);
221 }
222 if (strlen(mr.dimm_name) > 0)
223 if (plugin_notification_meta_add_string(n, MCELOG_DIMM_NAME, mr.dimm_name) <
224 0) {
225 ERROR("%s: add DIMM name meta data failed", MCELOG_PLUGIN);
226 plugin_notification_meta_free(n->meta);
227 return (-1);
228 }
229 if (plugin_notification_meta_add_signed_int(n, MCELOG_CORRECTED_ERR,
230 mr.corrected_err_total) < 0) {
231 ERROR("%s: add corrected errors meta data failed", MCELOG_PLUGIN);
232 plugin_notification_meta_free(n->meta);
233 return (-1);
234 }
235 if (plugin_notification_meta_add_signed_int(
236 n, "corrected memory timed errors", mr.corrected_err_timed) < 0) {
237 ERROR("%s: add corrected timed errors meta data failed", MCELOG_PLUGIN);
238 plugin_notification_meta_free(n->meta);
239 return (-1);
240 }
241 if (plugin_notification_meta_add_string(n, "corrected errors time period",
242 mr.corrected_err_timed_period) < 0) {
243 ERROR("%s: add corrected errors period meta data failed", MCELOG_PLUGIN);
244 plugin_notification_meta_free(n->meta);
245 return (-1);
246 }
247 if (plugin_notification_meta_add_signed_int(n, MCELOG_UNCORRECTED_ERR,
248 mr.uncorrected_err_total) < 0) {
249 ERROR("%s: add corrected errors meta data failed", MCELOG_PLUGIN);
250 plugin_notification_meta_free(n->meta);
251 return (-1);
252 }
253 if (plugin_notification_meta_add_signed_int(
254 n, "uncorrected memory timed errors", mr.uncorrected_err_timed) < 0) {
255 ERROR("%s: add corrected timed errors meta data failed", MCELOG_PLUGIN);
256 plugin_notification_meta_free(n->meta);
257 return (-1);
258 }
259 if (plugin_notification_meta_add_string(n, "uncorrected errors time period",
260 mr.uncorrected_err_timed_period) <
261 0) {
262 ERROR("%s: add corrected errors period meta data failed", MCELOG_PLUGIN);
263 plugin_notification_meta_free(n->meta);
264 return (-1);
265 }
267 return (0);
268 }
270 static int mcelog_submit(mcelog_memory_rec_t mr) {
272 value_list_t vl = VALUE_LIST_INIT;
273 vl.values_len = 1;
274 vl.time = cdtime();
276 sstrncpy(vl.plugin, MCELOG_PLUGIN, sizeof(vl.plugin));
277 sstrncpy(vl.type, "errors", sizeof(vl.type));
278 if (strlen(mr.dimm_name) > 0) {
279 ssnprintf(vl.plugin_instance, sizeof(vl.plugin_instance), "%s_%s",
280 mr.location, mr.dimm_name);
281 } else
282 sstrncpy(vl.plugin_instance, mr.location, sizeof(vl.plugin_instance));
284 sstrncpy(vl.type_instance, "corrected_memory_errors",
285 sizeof(vl.type_instance));
286 vl.values = &(value_t){.derive = (derive_t)mr.corrected_err_total};
287 plugin_dispatch_values(&vl);
289 ssnprintf(vl.type_instance, sizeof(vl.type_instance),
290 "corrected_memory_errors_in_%s", mr.corrected_err_timed_period);
291 vl.values = &(value_t){.derive = (derive_t)mr.corrected_err_timed};
292 plugin_dispatch_values(&vl);
294 sstrncpy(vl.type_instance, "uncorrected_memory_errors",
295 sizeof(vl.type_instance));
296 vl.values = &(value_t){.derive = (derive_t)mr.uncorrected_err_total};
297 plugin_dispatch_values(&vl);
299 ssnprintf(vl.type_instance, sizeof(vl.type_instance),
300 "uncorrected_memory_errors_in_%s", mr.uncorrected_err_timed_period);
301 vl.values = &(value_t){.derive = (derive_t)mr.uncorrected_err_timed};
302 plugin_dispatch_values(&vl);
304 return 0;
305 }
307 static int parse_memory_info(FILE *p_file, mcelog_memory_rec_t *memory_record) {
308 char buf[DATA_MAX_NAME_LEN] = {0};
309 while (fgets(buf, sizeof(buf), p_file)) {
310 /* Got empty line or "done" */
311 if ((!strncmp("\n", buf, strlen(buf))) ||
312 (!strncmp(buf, "done\n", strlen(buf))))
313 return 1;
314 if (strlen(buf) < 5)
315 continue;
316 if (!strncmp(buf, MCELOG_SOCKET_STR, strlen(MCELOG_SOCKET_STR))) {
317 sstrncpy(memory_record->location, buf, strlen(buf));
318 /* replace spaces with '_' */
319 for (size_t i = 0; i < strlen(memory_record->location); i++)
320 if (memory_record->location[i] == ' ')
321 memory_record->location[i] = '_';
322 DEBUG("%s: Got SOCKET INFO %s", MCELOG_PLUGIN, memory_record->location);
323 }
324 if (!strncmp(buf, MCELOG_DIMM_NAME, strlen(MCELOG_DIMM_NAME))) {
325 char *name = NULL;
326 char *saveptr = NULL;
327 name = strtok_r(buf, "\"", &saveptr);
328 if (name != NULL && saveptr != NULL) {
329 name = strtok_r(NULL, "\"", &saveptr);
330 if (name != NULL) {
331 sstrncpy(memory_record->dimm_name, name,
332 sizeof(memory_record->dimm_name));
333 DEBUG("%s: Got DIMM NAME %s", MCELOG_PLUGIN,
334 memory_record->dimm_name);
335 }
336 }
337 }
338 if (!strncmp(buf, MCELOG_CORRECTED_ERR, strlen(MCELOG_CORRECTED_ERR))) {
339 /* Get next line*/
340 if (fgets(buf, sizeof(buf), p_file) != NULL) {
341 sscanf(buf, "\t%d total", &(memory_record->corrected_err_total));
342 DEBUG("%s: Got corrected error total %d", MCELOG_PLUGIN,
343 memory_record->corrected_err_total);
344 }
345 if (fgets(buf, sizeof(buf), p_file) != NULL) {
346 sscanf(buf, "\t%d in %s", &(memory_record->corrected_err_timed),
347 memory_record->corrected_err_timed_period);
348 DEBUG("%s: Got timed corrected errors %d in %s", MCELOG_PLUGIN,
349 memory_record->corrected_err_total,
350 memory_record->corrected_err_timed_period);
351 }
352 }
353 if (!strncmp(buf, MCELOG_UNCORRECTED_ERR, strlen(MCELOG_UNCORRECTED_ERR))) {
354 if (fgets(buf, sizeof(buf), p_file) != NULL) {
355 sscanf(buf, "\t%d total", &(memory_record->uncorrected_err_total));
356 DEBUG("%s: Got uncorrected error total %d", MCELOG_PLUGIN,
357 memory_record->uncorrected_err_total);
358 }
359 if (fgets(buf, sizeof(buf), p_file) != NULL) {
360 sscanf(buf, "\t%d in %s", &(memory_record->uncorrected_err_timed),
361 memory_record->uncorrected_err_timed_period);
362 DEBUG("%s: Got timed uncorrected errors %d in %s", MCELOG_PLUGIN,
363 memory_record->uncorrected_err_total,
364 memory_record->uncorrected_err_timed_period);
365 }
366 }
367 memset(buf, 0, sizeof(buf));
368 }
369 /* parsing definitely finished */
370 return 0;
371 }
373 static void poll_worker_cleanup(void *arg) {
374 mcelog_thread_running = 0;
375 FILE *p_file = *((FILE **)arg);
376 if (p_file != NULL)
377 fclose(p_file);
378 free(arg);
379 }
381 static int socket_receive(socket_adapter_t *self, FILE **pp_file) {
382 int res = -1;
383 pthread_rwlock_rdlock(&self->lock);
384 struct pollfd poll_fd = {
385 .fd = self->sock_fd, .events = POLLIN | POLLPRI,
386 };
388 if ((res = poll(&poll_fd, 1, MCELOG_POLL_TIMEOUT)) <= 0) {
389 if (res != 0 && errno != EINTR) {
390 char errbuf[MCELOG_BUFF_SIZE];
391 ERROR("mcelog: poll failed: %s",
392 sstrerror(errno, errbuf, sizeof(errbuf)));
393 }
394 pthread_rwlock_unlock(&self->lock);
395 return res;
396 }
398 if (poll_fd.revents & (POLLERR | POLLHUP | POLLNVAL)) {
399 /* connection is broken */
400 ERROR("%s: Connection to socket is broken", MCELOG_PLUGIN);
401 if (poll_fd.revents & (POLLERR | POLLHUP)) {
402 mcelog_dispatch_notification(
403 &(notification_t){.severity = NOTIF_FAILURE,
404 .time = cdtime(),
405 .message = "Connection to mcelog socket is broken.",
406 .plugin = MCELOG_PLUGIN,
407 .type_instance = "mcelog_status"});
408 }
409 pthread_rwlock_unlock(&self->lock);
410 return -1;
411 }
413 if (!(poll_fd.revents & (POLLIN | POLLPRI))) {
414 INFO("%s: No data to read", MCELOG_PLUGIN);
415 pthread_rwlock_unlock(&self->lock);
416 return 0;
417 }
419 if ((*pp_file = fdopen(dup(self->sock_fd), "r")) == NULL)
420 res = -1;
422 pthread_rwlock_unlock(&self->lock);
423 return res;
424 }
426 static void *poll_worker(__attribute__((unused)) void *arg) {
427 char errbuf[MCELOG_BUFF_SIZE];
428 mcelog_thread_running = 1;
429 FILE **pp_file = calloc(1, sizeof(*pp_file));
430 if (pp_file == NULL) {
431 ERROR("mcelog: memory allocation failed: %s",
432 sstrerror(errno, errbuf, sizeof(errbuf)));
433 pthread_exit((void *)1);
434 }
436 pthread_cleanup_push(poll_worker_cleanup, pp_file);
438 while (1) {
439 /* blocking call */
440 int res = socket_adapter.receive(&socket_adapter, pp_file);
441 if (res < 0) {
442 socket_adapter.close(&socket_adapter);
443 while (socket_adapter.reinit(&socket_adapter) != 0) {
444 nanosleep(&CDTIME_T_TO_TIMESPEC(MS_TO_CDTIME_T(MCELOG_POLL_TIMEOUT)),
445 NULL);
446 }
447 continue;
448 }
449 /* timeout or no data to read */
450 else if (res == 0)
451 continue;
453 if (*pp_file == NULL)
454 continue;
456 mcelog_memory_rec_t memory_record = {0};
457 while (parse_memory_info(*pp_file, &memory_record)) {
458 /* Check if location was successfully parsed */
459 if (memory_record.location[0] == '\0') {
460 memset(&memory_record, 0, sizeof(memory_record));
461 continue;
462 }
464 notification_t n = {.severity = NOTIF_OKAY,
465 .time = cdtime(),
466 .message = "Got memory errors info.",
467 .plugin = MCELOG_PLUGIN,
468 .type_instance = "memory_erros"};
470 if (mcelog_prepare_notification(&n, memory_record) == 0)
471 mcelog_dispatch_notification(&n);
472 if (mcelog_submit(memory_record) != 0)
473 ERROR("%s: Failed to submit memory errors", MCELOG_PLUGIN);
474 memset(&memory_record, 0, sizeof(memory_record));
475 }
477 fclose(*pp_file);
478 *pp_file = NULL;
479 }
481 mcelog_thread_running = 0;
482 pthread_cleanup_pop(1);
483 return NULL;
484 }
486 static int mcelog_init(void) {
487 if (socket_adapter.reinit(&socket_adapter) != 0) {
488 ERROR("%s: Cannot connect to client socket", MCELOG_PLUGIN);
489 return -1;
490 }
492 if (plugin_thread_create(&g_mcelog_config.tid, NULL, poll_worker, NULL,
493 NULL) != 0) {
494 ERROR("%s: Error creating poll thread.", MCELOG_PLUGIN);
495 return -1;
496 }
497 return 0;
498 }
500 static int get_memory_machine_checks(void) {
501 static const char dump[] = "dump all bios\n";
502 int ret = socket_adapter.write(&socket_adapter, dump, sizeof(dump));
503 if (ret != 0)
504 ERROR("%s: SENT DUMP REQUEST FAILED", MCELOG_PLUGIN);
505 else
506 DEBUG("%s: SENT DUMP REQUEST OK", MCELOG_PLUGIN);
507 return ret;
508 }
510 static int mcelog_read(__attribute__((unused)) user_data_t *ud) {
511 DEBUG("%s: %s", MCELOG_PLUGIN, __FUNCTION__);
513 if (get_memory_machine_checks() != 0)
514 ERROR("%s: MACHINE CHECK INFO NOT AVAILABLE", MCELOG_PLUGIN);
516 return 0;
517 }
519 static int mcelog_shutdown(void) {
520 int ret = 0;
521 if (mcelog_thread_running) {
522 pthread_cancel(g_mcelog_config.tid);
523 if (pthread_join(g_mcelog_config.tid, NULL) != 0) {
524 ERROR("%s: Stopping thread failed.", MCELOG_PLUGIN);
525 ret = -1;
526 }
527 }
529 ret = socket_adapter.close(&socket_adapter) || ret;
530 pthread_rwlock_destroy(&(socket_adapter.lock));
531 return -ret;
532 }
534 void module_register(void) {
535 plugin_register_complex_config(MCELOG_PLUGIN, mcelog_config);
536 plugin_register_init(MCELOG_PLUGIN, mcelog_init);
537 plugin_register_complex_read(NULL, MCELOG_PLUGIN, mcelog_read, 0, NULL);
538 plugin_register_shutdown(MCELOG_PLUGIN, mcelog_shutdown);
539 }