1 /*-
2 * collectd - src/mcelog.c
3 * MIT License
4 *
5 * Copyright(c) 2016 Intel Corporation. All rights reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
25 * Authors:
26 * Maryam Tahhan <maryam.tahhan@intel.com>
27 * Volodymyr Mytnyk <volodymyrx.mytnyk@intel.com>
28 * Taras Chornyi <tarasx.chornyi@intel.com>
29 * Krzysztof Matczak <krzysztofx.matczak@intel.com>
30 */
32 #include "collectd.h"
33 #include "common.h"
35 #include <poll.h>
36 #include <sys/socket.h>
37 #include <sys/un.h>
38 #include <unistd.h>
/* Plugin name; also used as the prefix of every log message. */
#define MCELOG_PLUGIN "mcelog"
/* Size of the scratch buffers handed to sstrerror(). */
#define MCELOG_BUFF_SIZE 1024
/* Timeout for poll() on the socket and delay between reconnect attempts. */
#define MCELOG_POLL_TIMEOUT 1000 /* ms */

/* Line prefixes searched for when parsing the mcelog reply. */
#define MCELOG_SOCKET_STR "SOCKET"
#define MCELOG_DIMM_NAME "DMI_NAME"
/* Section headers in the reply; also used as notification meta data keys. */
#define MCELOG_CORRECTED_ERR "corrected memory errors"
#define MCELOG_UNCORRECTED_ERR "uncorrected memory errors"

/* Type instances used for dispatched values and notifications. */
#define MCELOG_CORRECTED_ERR_TYPE_INS "corrected_memory_errors"
#define MCELOG_UNCORRECTED_ERR_TYPE_INS "uncorrected_memory_errors"
/* Plugin-wide settings plus the handle of the polling thread. */
struct mcelog_config_s {
  char logfile[PATH_MAX]; /* mcelog logfile path ("McelogLogfile" option) */
  pthread_t tid;          /* id of the thread created for poll_worker() */
};
typedef struct mcelog_config_s mcelog_config_t;
55 typedef struct socket_adapter_s socket_adapter_t;
57 struct socket_adapter_s {
58 int sock_fd; /* mcelog server socket fd */
59 struct sockaddr_un unix_sock; /* mcelog client socket */
60 pthread_rwlock_t lock;
61 /* function pointers for socket operations */
62 int (*write)(socket_adapter_t *self, const char *msg, const size_t len);
63 int (*reinit)(socket_adapter_t *self);
64 int (*receive)(socket_adapter_t *self, FILE **p_file);
65 int (*close)(socket_adapter_t *self);
66 };
68 typedef struct mcelog_memory_rec_s {
69 int corrected_err_total; /* x total*/
70 int corrected_err_timed; /* x in 24h*/
71 char corrected_err_timed_period[DATA_MAX_NAME_LEN];
72 int uncorrected_err_total; /* x total*/
73 int uncorrected_err_timed; /* x in 24h*/
74 char uncorrected_err_timed_period[DATA_MAX_NAME_LEN];
75 char location[DATA_MAX_NAME_LEN]; /* SOCKET x CHANNEL x DIMM x*/
76 char dimm_name[DATA_MAX_NAME_LEN]; /* DMI_NAME "DIMM_F1" */
77 } mcelog_memory_rec_t;
79 static int socket_close(socket_adapter_t *self);
80 static int socket_write(socket_adapter_t *self, const char *msg,
81 const size_t len);
82 static int socket_reinit(socket_adapter_t *self);
83 static int socket_receive(socket_adapter_t *self, FILE **p_file);
85 static mcelog_config_t g_mcelog_config = {
86 .logfile = "/var/log/mcelog",
87 };
89 static socket_adapter_t socket_adapter = {
90 .sock_fd = -1,
91 .unix_sock =
92 {
93 .sun_family = AF_UNIX, .sun_path = "/var/run/mcelog-client",
94 },
95 .lock = PTHREAD_RWLOCK_INITIALIZER,
96 .close = socket_close,
97 .write = socket_write,
98 .reinit = socket_reinit,
99 .receive = socket_receive,
100 };
102 static _Bool mcelog_thread_running;
104 static int mcelog_config(oconfig_item_t *ci) {
105 for (int i = 0; i < ci->children_num; i++) {
106 oconfig_item_t *child = ci->children + i;
107 if (strcasecmp("McelogClientSocket", child->key) == 0) {
108 if (cf_util_get_string_buffer(child, socket_adapter.unix_sock.sun_path,
109 sizeof(socket_adapter.unix_sock.sun_path)) <
110 0) {
111 ERROR(MCELOG_PLUGIN ": Invalid configuration option: \"%s\".",
112 child->key);
113 return (-1);
114 }
115 } else if (strcasecmp("McelogLogfile", child->key) == 0) {
116 if (cf_util_get_string_buffer(child, g_mcelog_config.logfile,
117 sizeof(g_mcelog_config.logfile)) < 0) {
118 ERROR(MCELOG_PLUGIN ": Invalid configuration option: \"%s\".",
119 child->key);
120 return (-1);
121 }
122 } else {
123 ERROR(MCELOG_PLUGIN ": Invalid configuration option: \"%s\".",
124 child->key);
125 return (-1);
126 }
127 }
128 return (0);
129 }
131 static int socket_close(socket_adapter_t *self) {
132 int ret = 0;
133 pthread_rwlock_rdlock(&self->lock);
134 if (fcntl(self->sock_fd, F_GETFL) != -1) {
135 char errbuf[MCELOG_BUFF_SIZE];
136 if (shutdown(self->sock_fd, SHUT_RDWR) != 0) {
137 ERROR(MCELOG_PLUGIN ": Socket shutdown failed: %s",
138 sstrerror(errno, errbuf, sizeof(errbuf)));
139 ret = -1;
140 }
141 if (close(self->sock_fd) != 0) {
142 ERROR(MCELOG_PLUGIN ": Socket close failed: %s",
143 sstrerror(errno, errbuf, sizeof(errbuf)));
144 ret = -1;
145 }
146 }
147 pthread_rwlock_unlock(&self->lock);
148 return (ret);
149 }
151 static int socket_write(socket_adapter_t *self, const char *msg,
152 const size_t len) {
153 int ret = 0;
154 pthread_rwlock_rdlock(&self->lock);
155 if (swrite(self->sock_fd, msg, len) < 0)
156 ret = -1;
157 pthread_rwlock_unlock(&self->lock);
158 return (ret);
159 }
161 static void mcelog_dispatch_notification(notification_t *n) {
162 if (!n) {
163 ERROR(MCELOG_PLUGIN ": %s: NULL pointer", __FUNCTION__);
164 return;
165 }
167 sstrncpy(n->host, hostname_g, sizeof(n->host));
168 sstrncpy(n->type, "gauge", sizeof(n->type));
169 plugin_dispatch_notification(n);
170 if (n->meta)
171 plugin_notification_meta_free(n->meta);
172 }
174 static int socket_reinit(socket_adapter_t *self) {
175 char errbuff[MCELOG_BUFF_SIZE];
176 int ret = -1;
177 cdtime_t interval = plugin_get_interval();
178 struct timeval socket_timeout = CDTIME_T_TO_TIMEVAL(interval);
180 /* synchronization via write lock since sock_fd may be changed here */
181 pthread_rwlock_wrlock(&self->lock);
182 self->sock_fd =
183 socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0);
184 if (self->sock_fd < 0) {
185 ERROR(MCELOG_PLUGIN ": Could not create a socket. %s",
186 sstrerror(errno, errbuff, sizeof(errbuff)));
187 pthread_rwlock_unlock(&self->lock);
188 return (ret);
189 }
191 /* Set socket timeout option */
192 if (setsockopt(self->sock_fd, SOL_SOCKET, SO_SNDTIMEO, &socket_timeout,
193 sizeof(socket_timeout)) < 0)
194 ERROR(MCELOG_PLUGIN ": Failed to set the socket timeout option.");
196 /* downgrading to read lock due to possible recursive read locks
197 * in self->close(self) call */
198 pthread_rwlock_unlock(&self->lock);
199 pthread_rwlock_rdlock(&self->lock);
200 if (connect(self->sock_fd, (struct sockaddr *)&(self->unix_sock),
201 sizeof(self->unix_sock)) < 0) {
202 ERROR(MCELOG_PLUGIN ": Failed to connect to mcelog server. %s",
203 sstrerror(errno, errbuff, sizeof(errbuff)));
204 self->close(self);
205 ret = -1;
206 } else {
207 ret = 0;
208 mcelog_dispatch_notification(
209 &(notification_t){.severity = NOTIF_OKAY,
210 .time = cdtime(),
211 .message = "Connected to mcelog server",
212 .plugin = MCELOG_PLUGIN,
213 .type_instance = "mcelog_status"});
214 }
215 pthread_rwlock_unlock(&self->lock);
216 return (ret);
217 }
219 static int mcelog_dispatch_mem_notifications(const mcelog_memory_rec_t *mr) {
220 notification_t n = {
221 .severity = NOTIF_WARNING,
222 .time = cdtime(),
223 .plugin = MCELOG_PLUGIN,
224 .type = "errors"};
226 if (mr == NULL)
227 return (-1);
229 sstrncpy(n.host, hostname_g, sizeof(n.host));
231 if (mr->dimm_name[0] != '\0')
232 ssnprintf(n.plugin_instance, sizeof(n.plugin_instance), "%s_%s",
233 mr->location, mr->dimm_name);
234 else
235 sstrncpy(n.plugin_instance, mr->location, sizeof(n.plugin_instance));
237 /* Corrected Error Notifications */
238 if (mr->corrected_err_total > 0 || mr->corrected_err_timed > 0) {
239 if (plugin_notification_meta_add_signed_int(&n, MCELOG_CORRECTED_ERR,
240 mr->corrected_err_total) < 0) {
241 ERROR(MCELOG_PLUGIN ": add corrected errors meta data failed");
242 plugin_notification_meta_free(n.meta);
243 return (-1);
244 }
245 if (plugin_notification_meta_add_signed_int(
246 &n, "corrected memory timed errors", mr->corrected_err_timed) < 0) {
247 ERROR(MCELOG_PLUGIN ": add corrected timed errors meta data failed");
248 plugin_notification_meta_free(n.meta);
249 return (-1);
250 }
251 ssnprintf(n.message, sizeof(n.message), "Corrected Memory Errors");
252 sstrncpy(n.type_instance, MCELOG_CORRECTED_ERR_TYPE_INS,
253 sizeof(n.type_instance));
254 plugin_dispatch_notification(&n);
256 if (n.meta)
257 plugin_notification_meta_free(n.meta);
258 }
260 /* Uncorrected Error Notifications */
261 if (mr->uncorrected_err_total > 0 || mr->uncorrected_err_timed > 0) {
262 if (plugin_notification_meta_add_signed_int(
263 &n, MCELOG_UNCORRECTED_ERR, mr->uncorrected_err_total) < 0) {
264 ERROR(MCELOG_PLUGIN ": add uncorrected errors meta data failed");
265 plugin_notification_meta_free(n.meta);
266 return (-1);
267 }
268 if (plugin_notification_meta_add_signed_int(
269 &n, "uncorrected memory timed errors", mr->uncorrected_err_timed) <
270 0) {
271 ERROR(MCELOG_PLUGIN ": add uncorrected timed errors meta data failed");
272 plugin_notification_meta_free(n.meta);
273 return (-1);
274 }
275 ssnprintf(n.message, sizeof(n.message), "Uncorrected Memory Errors");
276 sstrncpy(n.type_instance, MCELOG_UNCORRECTED_ERR_TYPE_INS,
277 sizeof(n.type_instance));
278 n.severity = NOTIF_FAILURE;
279 plugin_dispatch_notification(&n);
281 if (n.meta)
282 plugin_notification_meta_free(n.meta);
283 }
285 return (0);
286 }
288 static int mcelog_submit(const mcelog_memory_rec_t *mr) {
290 if (!mr) {
291 ERROR(MCELOG_PLUGIN ": %s: NULL pointer", __FUNCTION__);
292 return (-1);
293 }
295 value_list_t vl = {
296 .values_len = 1,
297 .values = &(value_t){.derive = (derive_t)mr->corrected_err_total},
298 .time = cdtime(),
299 .plugin = MCELOG_PLUGIN,
300 .type = "errors",
301 .type_instance = MCELOG_CORRECTED_ERR_TYPE_INS};
303 if (mr->dimm_name[0] != '\0')
304 ssnprintf(vl.plugin_instance, sizeof(vl.plugin_instance), "%s_%s",
305 mr->location, mr->dimm_name);
306 else
307 sstrncpy(vl.plugin_instance, mr->location, sizeof(vl.plugin_instance));
309 plugin_dispatch_values(&vl);
311 ssnprintf(vl.type_instance, sizeof(vl.type_instance),
312 "corrected_memory_errors_in_%s", mr->corrected_err_timed_period);
313 vl.values = &(value_t){.derive = (derive_t)mr->corrected_err_timed};
314 plugin_dispatch_values(&vl);
316 sstrncpy(vl.type_instance, MCELOG_UNCORRECTED_ERR_TYPE_INS,
317 sizeof(vl.type_instance));
318 vl.values = &(value_t){.derive = (derive_t)mr->uncorrected_err_total};
319 plugin_dispatch_values(&vl);
321 ssnprintf(vl.type_instance, sizeof(vl.type_instance),
322 "uncorrected_memory_errors_in_%s",
323 mr->uncorrected_err_timed_period);
324 vl.values = &(value_t){.derive = (derive_t)mr->uncorrected_err_timed};
325 plugin_dispatch_values(&vl);
327 return (0);
328 }
330 static int parse_memory_info(FILE *p_file, mcelog_memory_rec_t *memory_record) {
331 char buf[DATA_MAX_NAME_LEN] = {0};
332 while (fgets(buf, sizeof(buf), p_file)) {
333 /* Got empty line or "done" */
334 if ((!strncmp("\n", buf, strlen(buf))) ||
335 (!strncmp(buf, "done\n", strlen(buf))))
336 return (1);
337 if (strlen(buf) < 5)
338 continue;
339 if (!strncmp(buf, MCELOG_SOCKET_STR, strlen(MCELOG_SOCKET_STR))) {
340 sstrncpy(memory_record->location, buf, strlen(buf));
341 /* replace spaces with '_' */
342 for (size_t i = 0; i < strlen(memory_record->location); i++)
343 if (memory_record->location[i] == ' ')
344 memory_record->location[i] = '_';
345 DEBUG(MCELOG_PLUGIN ": Got SOCKET INFO %s", memory_record->location);
346 }
347 if (!strncmp(buf, MCELOG_DIMM_NAME, strlen(MCELOG_DIMM_NAME))) {
348 char *name = NULL;
349 char *saveptr = NULL;
350 name = strtok_r(buf, "\"", &saveptr);
351 if (name != NULL && saveptr != NULL) {
352 name = strtok_r(NULL, "\"", &saveptr);
353 if (name != NULL) {
354 sstrncpy(memory_record->dimm_name, name,
355 sizeof(memory_record->dimm_name));
356 DEBUG(MCELOG_PLUGIN ": Got DIMM NAME %s", memory_record->dimm_name);
357 }
358 }
359 }
360 if (!strncmp(buf, MCELOG_CORRECTED_ERR, strlen(MCELOG_CORRECTED_ERR))) {
361 /* Get next line*/
362 if (fgets(buf, sizeof(buf), p_file) != NULL) {
363 sscanf(buf, "\t%d total", &(memory_record->corrected_err_total));
364 DEBUG(MCELOG_PLUGIN ": Got corrected error total %d",
365 memory_record->corrected_err_total);
366 }
367 if (fgets(buf, sizeof(buf), p_file) != NULL) {
368 sscanf(buf, "\t%d in %s", &(memory_record->corrected_err_timed),
369 memory_record->corrected_err_timed_period);
370 DEBUG(MCELOG_PLUGIN ": Got timed corrected errors %d in %s",
371 memory_record->corrected_err_total,
372 memory_record->corrected_err_timed_period);
373 }
374 }
375 if (!strncmp(buf, MCELOG_UNCORRECTED_ERR, strlen(MCELOG_UNCORRECTED_ERR))) {
376 if (fgets(buf, sizeof(buf), p_file) != NULL) {
377 sscanf(buf, "\t%d total", &(memory_record->uncorrected_err_total));
378 DEBUG(MCELOG_PLUGIN ": Got uncorrected error total %d",
379 memory_record->uncorrected_err_total);
380 }
381 if (fgets(buf, sizeof(buf), p_file) != NULL) {
382 sscanf(buf, "\t%d in %s", &(memory_record->uncorrected_err_timed),
383 memory_record->uncorrected_err_timed_period);
384 DEBUG(MCELOG_PLUGIN ": Got timed uncorrected errors %d in %s",
385 memory_record->uncorrected_err_total,
386 memory_record->uncorrected_err_timed_period);
387 }
388 }
389 memset(buf, 0, sizeof(buf));
390 }
391 /* parsing definitely finished */
392 return (0);
393 }
395 static void poll_worker_cleanup(void *arg) {
396 mcelog_thread_running = 0;
397 FILE *p_file = *((FILE **)arg);
398 if (p_file != NULL)
399 fclose(p_file);
400 free(arg);
401 }
403 static int socket_receive(socket_adapter_t *self, FILE **pp_file) {
404 int res = -1;
405 pthread_rwlock_rdlock(&self->lock);
406 struct pollfd poll_fd = {
407 .fd = self->sock_fd, .events = POLLIN | POLLPRI,
408 };
410 if ((res = poll(&poll_fd, 1, MCELOG_POLL_TIMEOUT)) <= 0) {
411 if (res != 0 && errno != EINTR) {
412 char errbuf[MCELOG_BUFF_SIZE];
413 ERROR("mcelog: poll failed: %s",
414 sstrerror(errno, errbuf, sizeof(errbuf)));
415 }
416 pthread_rwlock_unlock(&self->lock);
417 return (res);
418 }
420 if (poll_fd.revents & (POLLERR | POLLHUP | POLLNVAL)) {
421 /* connection is broken */
422 ERROR(MCELOG_PLUGIN ": Connection to socket is broken");
423 if (poll_fd.revents & (POLLERR | POLLHUP)) {
424 mcelog_dispatch_notification(
425 &(notification_t){.severity = NOTIF_FAILURE,
426 .time = cdtime(),
427 .message = "Connection to mcelog socket is broken.",
428 .plugin = MCELOG_PLUGIN,
429 .type_instance = "mcelog_status"});
430 }
431 pthread_rwlock_unlock(&self->lock);
432 return (-1);
433 }
435 if (!(poll_fd.revents & (POLLIN | POLLPRI))) {
436 INFO(MCELOG_PLUGIN ": No data to read");
437 pthread_rwlock_unlock(&self->lock);
438 return (0);
439 }
441 if ((*pp_file = fdopen(dup(self->sock_fd), "r")) == NULL)
442 res = -1;
444 pthread_rwlock_unlock(&self->lock);
445 return (res);
446 }
448 static void *poll_worker(__attribute__((unused)) void *arg) {
449 char errbuf[MCELOG_BUFF_SIZE];
450 mcelog_thread_running = 1;
451 FILE **pp_file = calloc(1, sizeof(*pp_file));
452 if (pp_file == NULL) {
453 ERROR("mcelog: memory allocation failed: %s",
454 sstrerror(errno, errbuf, sizeof(errbuf)));
455 pthread_exit((void *)1);
456 }
458 pthread_cleanup_push(poll_worker_cleanup, pp_file);
460 while (1) {
461 /* blocking call */
462 int res = socket_adapter.receive(&socket_adapter, pp_file);
463 if (res < 0) {
464 socket_adapter.close(&socket_adapter);
465 while (socket_adapter.reinit(&socket_adapter) != 0) {
466 nanosleep(&CDTIME_T_TO_TIMESPEC(MS_TO_CDTIME_T(MCELOG_POLL_TIMEOUT)),
467 NULL);
468 }
469 continue;
470 }
471 /* timeout or no data to read */
472 else if (res == 0)
473 continue;
475 if (*pp_file == NULL)
476 continue;
478 mcelog_memory_rec_t memory_record = {0};
479 while (parse_memory_info(*pp_file, &memory_record)) {
480 /* Check if location was successfully parsed */
481 if (memory_record.location[0] == '\0') {
482 memset(&memory_record, 0, sizeof(memory_record));
483 continue;
484 }
486 if (mcelog_dispatch_mem_notifications(&memory_record) != 0)
487 ERROR(MCELOG_PLUGIN ": Failed to submit memory errors notification");
488 if (mcelog_submit(&memory_record) != 0)
489 ERROR(MCELOG_PLUGIN ": Failed to submit memory errors");
490 memset(&memory_record, 0, sizeof(memory_record));
491 }
493 fclose(*pp_file);
494 *pp_file = NULL;
495 }
497 mcelog_thread_running = 0;
498 pthread_cleanup_pop(1);
499 return (NULL);
500 }
502 static int mcelog_init(void) {
503 if (socket_adapter.reinit(&socket_adapter) != 0) {
504 ERROR(MCELOG_PLUGIN ": Cannot connect to client socket");
505 return (-1);
506 }
508 if (plugin_thread_create(&g_mcelog_config.tid, NULL, poll_worker, NULL,
509 NULL) != 0) {
510 ERROR(MCELOG_PLUGIN ": Error creating poll thread.");
511 return (-1);
512 }
513 return (0);
514 }
516 static int get_memory_machine_checks(void) {
517 static const char dump[] = "dump all bios\n";
518 int ret = socket_adapter.write(&socket_adapter, dump, sizeof(dump));
519 if (ret != 0)
520 ERROR(MCELOG_PLUGIN ": SENT DUMP REQUEST FAILED");
521 else
522 DEBUG(MCELOG_PLUGIN ": SENT DUMP REQUEST OK");
523 return (ret);
524 }
526 static int mcelog_read(__attribute__((unused)) user_data_t *ud) {
527 DEBUG(MCELOG_PLUGIN ": %s", __FUNCTION__);
529 if (get_memory_machine_checks() != 0)
530 ERROR(MCELOG_PLUGIN ": MACHINE CHECK INFO NOT AVAILABLE");
532 return (0);
533 }
535 static int mcelog_shutdown(void) {
536 int ret = 0;
537 if (mcelog_thread_running) {
538 pthread_cancel(g_mcelog_config.tid);
539 if (pthread_join(g_mcelog_config.tid, NULL) != 0) {
540 ERROR(MCELOG_PLUGIN ": Stopping thread failed.");
541 ret = -1;
542 }
543 }
545 ret = socket_adapter.close(&socket_adapter) || ret;
546 pthread_rwlock_destroy(&(socket_adapter.lock));
547 return (-ret);
548 }
550 void module_register(void) {
551 plugin_register_complex_config(MCELOG_PLUGIN, mcelog_config);
552 plugin_register_init(MCELOG_PLUGIN, mcelog_init);
553 plugin_register_complex_read(NULL, MCELOG_PLUGIN, mcelog_read, 0, NULL);
554 plugin_register_shutdown(MCELOG_PLUGIN, mcelog_shutdown);
555 }