1 /*-
2 * collectd - src/mcelog.c
3 * MIT License
4 *
5 * Copyright(c) 2016 Intel Corporation. All rights reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
25 * Authors:
26 * Maryam Tahhan <maryam.tahhan@intel.com>
27 * Volodymyr Mytnyk <volodymyrx.mytnyk@intel.com>
28 * Taras Chornyi <tarasx.chornyi@intel.com>
29 * Krzysztof Matczak <krzysztofx.matczak@intel.com>
30 */
32 #include "common.h"
33 #include "collectd.h"
35 #include <poll.h>
36 #include <sys/socket.h>
37 #include <sys/un.h>
38 #include <unistd.h>
/* Plugin name; used as the log-message prefix and as the registration name. */
#define MCELOG_PLUGIN "mcelog"
/* Size of the scratch buffers passed to sstrerror(). */
#define MCELOG_BUFF_SIZE 1024
/* Timeout for poll() on the mcelog socket; also used as the delay between
 * reconnect attempts in the poll thread. */
#define MCELOG_POLL_TIMEOUT 1000 /* ms */
/* Line prefix that carries the memory location; also the meta-data key. */
#define MCELOG_SOCKET_STR "SOCKET"
/* Line prefix that carries the DIMM name; also the meta-data key. */
#define MCELOG_DIMM_NAME "DMI_NAME"
/* Line prefixes that introduce the error counters; also used as meta-data
 * keys for the total counts. */
#define MCELOG_CORRECTED_ERR "corrected memory errors"
#define MCELOG_UNCORRECTED_ERR "uncorrected memory errors"
/* Plugin-wide settings and state.
 * NOTE(review): logfile is filled from the "McelogLogfile" option but is not
 * read anywhere else in this file. */
typedef struct mcelog_config_s {
  char logfile[PATH_MAX]; /* mcelog logfile */
  pthread_t tid;          /* poll thread id */
} mcelog_config_t;
typedef struct socket_adapter_s socket_adapter_t;

/* Bundles the mcelog client socket, the lock guarding its descriptor and the
 * operations that act on it. The operations are reached through function
 * pointers and receive the adapter itself as first argument. */
struct socket_adapter_s {
  int sock_fd;                  /* mcelog server socket fd */
  struct sockaddr_un unix_sock; /* mcelog client socket */
  pthread_rwlock_t lock;        /* guards sock_fd */
  /* function pointers for socket operations */
  int (*write)(socket_adapter_t *self, const char *msg, const size_t len);
  int (*reinit)(socket_adapter_t *self);
  int (*receive)(socket_adapter_t *self, FILE **p_file);
  int (*close)(socket_adapter_t *self);
};
/* One memory record as parsed from the mcelog dump. The *_timed_period
 * strings hold the token that follows "in" on the timed counter line. */
typedef struct mcelog_memory_rec_s {
  int corrected_err_total; /* x total*/
  int corrected_err_timed; /* x in 24h*/
  char corrected_err_timed_period[DATA_MAX_NAME_LEN];
  int uncorrected_err_total; /* x total*/
  int uncorrected_err_timed; /* x in 24h*/
  char uncorrected_err_timed_period[DATA_MAX_NAME_LEN];
  char location[DATA_MAX_NAME_LEN];  /* SOCKET x CHANNEL x DIMM x*/
  char dimm_name[DATA_MAX_NAME_LEN]; /* DMI_NAME "DIMM_F1" */
} mcelog_memory_rec_t;
/* Prototypes of the socket operations; they are referenced by the
 * socket_adapter initializer before their definitions appear. */
static int socket_close(socket_adapter_t *self);
static int socket_write(socket_adapter_t *self, const char *msg,
                        const size_t len);
static int socket_reinit(socket_adapter_t *self);
static int socket_receive(socket_adapter_t *self, FILE **p_file);
/* Plugin configuration with the default logfile path. */
static mcelog_config_t g_mcelog_config = {.logfile = "/var/log/mcelog"};

/* The single socket adapter instance: no descriptor yet (-1), default client
 * socket path, and the operations implemented in this file. */
static socket_adapter_t socket_adapter = {
    .sock_fd = -1,
    .unix_sock =
        {
            .sun_family = AF_UNIX, .sun_path = "/var/run/mcelog-client",
        },
    .lock = PTHREAD_RWLOCK_INITIALIZER,
    .close = socket_close,
    .write = socket_write,
    .reinit = socket_reinit,
    .receive = socket_receive,
};

/* Set by the poll thread when it starts, cleared by its cleanup handler and
 * checked at shutdown to decide whether the thread must be cancelled. */
static _Bool mcelog_thread_running;
100 static int mcelog_config(oconfig_item_t *ci) {
101 for (int i = 0; i < ci->children_num; i++) {
102 oconfig_item_t *child = ci->children + i;
103 if (strcasecmp("McelogClientSocket", child->key) == 0) {
104 if (cf_util_get_string_buffer(child, socket_adapter.unix_sock.sun_path,
105 sizeof(socket_adapter.unix_sock.sun_path)) <
106 0) {
107 ERROR(MCELOG_PLUGIN ": Invalid configuration option: \"%s\".",
108 child->key);
109 return -1;
110 }
111 } else if (strcasecmp("McelogLogfile", child->key) == 0) {
112 if (cf_util_get_string_buffer(child, g_mcelog_config.logfile,
113 sizeof(g_mcelog_config.logfile)) < 0) {
114 ERROR(MCELOG_PLUGIN ": Invalid configuration option: \"%s\".",
115 child->key);
116 return -1;
117 }
118 } else {
119 ERROR(MCELOG_PLUGIN ": Invalid configuration option: \"%s\".",
120 child->key);
121 return -1;
122 }
123 }
124 return (0);
125 }
127 static int socket_close(socket_adapter_t *self) {
128 int ret = 0;
129 pthread_rwlock_rdlock(&self->lock);
130 if (fcntl(self->sock_fd, F_GETFL) != -1) {
131 char errbuf[MCELOG_BUFF_SIZE];
132 if (shutdown(self->sock_fd, SHUT_RDWR) != 0) {
133 ERROR(MCELOG_PLUGIN ": Socket shutdown failed: %s",
134 sstrerror(errno, errbuf, sizeof(errbuf)));
135 ret = -1;
136 }
137 if (close(self->sock_fd) != 0) {
138 ERROR(MCELOG_PLUGIN ": Socket close failed: %s",
139 sstrerror(errno, errbuf, sizeof(errbuf)));
140 ret = -1;
141 }
142 }
143 pthread_rwlock_unlock(&self->lock);
144 return ret;
145 }
147 static int socket_write(socket_adapter_t *self, const char *msg,
148 const size_t len) {
149 int ret = 0;
150 pthread_rwlock_rdlock(&self->lock);
151 if (swrite(self->sock_fd, msg, len) < 0)
152 ret = -1;
153 pthread_rwlock_unlock(&self->lock);
154 return ret;
155 }
157 static void mcelog_dispatch_notification(notification_t *n) {
158 if (!n) {
159 ERROR(MCELOG_PLUGIN ": %s: NULL pointer", __FUNCTION__);
160 return;
161 }
163 sstrncpy(n->host, hostname_g, sizeof(n->host));
164 sstrncpy(n->type, "gauge", sizeof(n->type));
165 plugin_dispatch_notification(n);
166 if (n->meta)
167 plugin_notification_meta_free(n->meta);
168 }
170 static int socket_reinit(socket_adapter_t *self) {
171 char errbuff[MCELOG_BUFF_SIZE];
172 int ret = -1;
173 cdtime_t interval = plugin_get_interval();
174 struct timeval socket_timeout = CDTIME_T_TO_TIMEVAL(interval);
176 /* synchronization via write lock since sock_fd may be changed here */
177 pthread_rwlock_wrlock(&self->lock);
178 self->sock_fd =
179 socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0);
180 if (self->sock_fd < 0) {
181 ERROR(MCELOG_PLUGIN ": Could not create a socket. %s",
182 sstrerror(errno, errbuff, sizeof(errbuff)));
183 pthread_rwlock_unlock(&self->lock);
184 return ret;
185 }
187 /* Set socket timeout option */
188 if (setsockopt(self->sock_fd, SOL_SOCKET, SO_SNDTIMEO, &socket_timeout,
189 sizeof(socket_timeout)) < 0)
190 ERROR(MCELOG_PLUGIN ": Failed to set the socket timeout option.");
192 /* downgrading to read lock due to possible recursive read locks
193 * in self->close(self) call */
194 pthread_rwlock_unlock(&self->lock);
195 pthread_rwlock_rdlock(&self->lock);
196 if (connect(self->sock_fd, (struct sockaddr *)&(self->unix_sock),
197 sizeof(self->unix_sock)) < 0) {
198 ERROR(MCELOG_PLUGIN ": Failed to connect to mcelog server. %s",
199 sstrerror(errno, errbuff, sizeof(errbuff)));
200 self->close(self);
201 ret = -1;
202 } else {
203 ret = 0;
204 mcelog_dispatch_notification(
205 &(notification_t){.severity = NOTIF_OKAY,
206 .time = cdtime(),
207 .message = "Connected to mcelog server",
208 .plugin = MCELOG_PLUGIN,
209 .type_instance = "mcelog_status"});
210 }
211 pthread_rwlock_unlock(&self->lock);
212 return ret;
213 }
215 static int mcelog_prepare_notification(notification_t *n,
216 const mcelog_memory_rec_t *mr) {
217 if (n == NULL || mr == NULL)
218 return (-1);
220 if (mr->location[0] != '\0')
221 if (plugin_notification_meta_add_string(n, MCELOG_SOCKET_STR,
222 mr->location) < 0) {
223 ERROR(MCELOG_PLUGIN ": add memory location meta data failed");
224 return (-1);
225 }
226 if (mr->dimm_name[0] != '\0')
227 if (plugin_notification_meta_add_string(n, MCELOG_DIMM_NAME,
228 mr->dimm_name) < 0) {
229 ERROR(MCELOG_PLUGIN ": add DIMM name meta data failed");
230 plugin_notification_meta_free(n->meta);
231 return (-1);
232 }
233 if (plugin_notification_meta_add_signed_int(n, MCELOG_CORRECTED_ERR,
234 mr->corrected_err_total) < 0) {
235 ERROR(MCELOG_PLUGIN ": add corrected errors meta data failed");
236 plugin_notification_meta_free(n->meta);
237 return (-1);
238 }
239 if (plugin_notification_meta_add_signed_int(
240 n, "corrected memory timed errors", mr->corrected_err_timed) < 0) {
241 ERROR(MCELOG_PLUGIN ": add corrected timed errors meta data failed");
242 plugin_notification_meta_free(n->meta);
243 return (-1);
244 }
245 if (mr->corrected_err_timed_period[0] != '\0')
246 if (plugin_notification_meta_add_string(n, "corrected errors time period",
247 mr->corrected_err_timed_period) <
248 0) {
249 ERROR(MCELOG_PLUGIN ": add corrected errors period meta data failed");
250 plugin_notification_meta_free(n->meta);
251 return (-1);
252 }
253 if (plugin_notification_meta_add_signed_int(n, MCELOG_UNCORRECTED_ERR,
254 mr->uncorrected_err_total) < 0) {
255 ERROR(MCELOG_PLUGIN ": add corrected errors meta data failed");
256 plugin_notification_meta_free(n->meta);
257 return (-1);
258 }
259 if (plugin_notification_meta_add_signed_int(n,
260 "uncorrected memory timed errors",
261 mr->uncorrected_err_timed) < 0) {
262 ERROR(MCELOG_PLUGIN ": add corrected timed errors meta data failed");
263 plugin_notification_meta_free(n->meta);
264 return (-1);
265 }
266 if (mr->uncorrected_err_timed_period[0] != '\0')
267 if (plugin_notification_meta_add_string(n, "uncorrected errors time period",
268 mr->uncorrected_err_timed_period) <
269 0) {
270 ERROR(MCELOG_PLUGIN ": add corrected errors period meta data failed");
271 plugin_notification_meta_free(n->meta);
272 return (-1);
273 }
275 return (0);
276 }
278 static int mcelog_submit(const mcelog_memory_rec_t *mr) {
280 if (!mr) {
281 ERROR(MCELOG_PLUGIN ": %s: NULL pointer", __FUNCTION__);
282 return -1;
283 }
285 value_list_t vl = {
286 .values_len = 1,
287 .values = &(value_t){.derive = (derive_t)mr->corrected_err_total},
288 .time = cdtime(),
289 .plugin = MCELOG_PLUGIN,
290 .type = "errors",
291 .type_instance = "corrected_memory_errors"};
293 if (mr->dimm_name[0] != '\0')
294 ssnprintf(vl.plugin_instance, sizeof(vl.plugin_instance), "%s_%s",
295 mr->location, mr->dimm_name);
296 else
297 sstrncpy(vl.plugin_instance, mr->location, sizeof(vl.plugin_instance));
299 plugin_dispatch_values(&vl);
301 ssnprintf(vl.type_instance, sizeof(vl.type_instance),
302 "corrected_memory_errors_in_%s", mr->corrected_err_timed_period);
303 vl.values = &(value_t){.derive = (derive_t)mr->corrected_err_timed};
304 plugin_dispatch_values(&vl);
306 sstrncpy(vl.type_instance, "uncorrected_memory_errors",
307 sizeof(vl.type_instance));
308 vl.values = &(value_t){.derive = (derive_t)mr->uncorrected_err_total};
309 plugin_dispatch_values(&vl);
311 ssnprintf(vl.type_instance, sizeof(vl.type_instance),
312 "uncorrected_memory_errors_in_%s",
313 mr->uncorrected_err_timed_period);
314 vl.values = &(value_t){.derive = (derive_t)mr->uncorrected_err_timed};
315 plugin_dispatch_values(&vl);
317 return 0;
318 }
320 static int parse_memory_info(FILE *p_file, mcelog_memory_rec_t *memory_record) {
321 char buf[DATA_MAX_NAME_LEN] = {0};
322 while (fgets(buf, sizeof(buf), p_file)) {
323 /* Got empty line or "done" */
324 if ((!strncmp("\n", buf, strlen(buf))) ||
325 (!strncmp(buf, "done\n", strlen(buf))))
326 return 1;
327 if (strlen(buf) < 5)
328 continue;
329 if (!strncmp(buf, MCELOG_SOCKET_STR, strlen(MCELOG_SOCKET_STR))) {
330 sstrncpy(memory_record->location, buf, strlen(buf));
331 /* replace spaces with '_' */
332 for (size_t i = 0; i < strlen(memory_record->location); i++)
333 if (memory_record->location[i] == ' ')
334 memory_record->location[i] = '_';
335 DEBUG(MCELOG_PLUGIN ": Got SOCKET INFO %s", memory_record->location);
336 }
337 if (!strncmp(buf, MCELOG_DIMM_NAME, strlen(MCELOG_DIMM_NAME))) {
338 char *name = NULL;
339 char *saveptr = NULL;
340 name = strtok_r(buf, "\"", &saveptr);
341 if (name != NULL && saveptr != NULL) {
342 name = strtok_r(NULL, "\"", &saveptr);
343 if (name != NULL) {
344 sstrncpy(memory_record->dimm_name, name,
345 sizeof(memory_record->dimm_name));
346 DEBUG(MCELOG_PLUGIN ": Got DIMM NAME %s", memory_record->dimm_name);
347 }
348 }
349 }
350 if (!strncmp(buf, MCELOG_CORRECTED_ERR, strlen(MCELOG_CORRECTED_ERR))) {
351 /* Get next line*/
352 if (fgets(buf, sizeof(buf), p_file) != NULL) {
353 sscanf(buf, "\t%d total", &(memory_record->corrected_err_total));
354 DEBUG(MCELOG_PLUGIN ": Got corrected error total %d",
355 memory_record->corrected_err_total);
356 }
357 if (fgets(buf, sizeof(buf), p_file) != NULL) {
358 sscanf(buf, "\t%d in %s", &(memory_record->corrected_err_timed),
359 memory_record->corrected_err_timed_period);
360 DEBUG(MCELOG_PLUGIN ": Got timed corrected errors %d in %s",
361 memory_record->corrected_err_total,
362 memory_record->corrected_err_timed_period);
363 }
364 }
365 if (!strncmp(buf, MCELOG_UNCORRECTED_ERR, strlen(MCELOG_UNCORRECTED_ERR))) {
366 if (fgets(buf, sizeof(buf), p_file) != NULL) {
367 sscanf(buf, "\t%d total", &(memory_record->uncorrected_err_total));
368 DEBUG(MCELOG_PLUGIN ": Got uncorrected error total %d",
369 memory_record->uncorrected_err_total);
370 }
371 if (fgets(buf, sizeof(buf), p_file) != NULL) {
372 sscanf(buf, "\t%d in %s", &(memory_record->uncorrected_err_timed),
373 memory_record->uncorrected_err_timed_period);
374 DEBUG(MCELOG_PLUGIN ": Got timed uncorrected errors %d in %s",
375 memory_record->uncorrected_err_total,
376 memory_record->uncorrected_err_timed_period);
377 }
378 }
379 memset(buf, 0, sizeof(buf));
380 }
381 /* parsing definitely finished */
382 return 0;
383 }
385 static void poll_worker_cleanup(void *arg) {
386 mcelog_thread_running = 0;
387 FILE *p_file = *((FILE **)arg);
388 if (p_file != NULL)
389 fclose(p_file);
390 free(arg);
391 }
393 static int socket_receive(socket_adapter_t *self, FILE **pp_file) {
394 int res = -1;
395 pthread_rwlock_rdlock(&self->lock);
396 struct pollfd poll_fd = {
397 .fd = self->sock_fd, .events = POLLIN | POLLPRI,
398 };
400 if ((res = poll(&poll_fd, 1, MCELOG_POLL_TIMEOUT)) <= 0) {
401 if (res != 0 && errno != EINTR) {
402 char errbuf[MCELOG_BUFF_SIZE];
403 ERROR("mcelog: poll failed: %s",
404 sstrerror(errno, errbuf, sizeof(errbuf)));
405 }
406 pthread_rwlock_unlock(&self->lock);
407 return res;
408 }
410 if (poll_fd.revents & (POLLERR | POLLHUP | POLLNVAL)) {
411 /* connection is broken */
412 ERROR(MCELOG_PLUGIN ": Connection to socket is broken");
413 if (poll_fd.revents & (POLLERR | POLLHUP)) {
414 mcelog_dispatch_notification(
415 &(notification_t){.severity = NOTIF_FAILURE,
416 .time = cdtime(),
417 .message = "Connection to mcelog socket is broken.",
418 .plugin = MCELOG_PLUGIN,
419 .type_instance = "mcelog_status"});
420 }
421 pthread_rwlock_unlock(&self->lock);
422 return -1;
423 }
425 if (!(poll_fd.revents & (POLLIN | POLLPRI))) {
426 INFO(MCELOG_PLUGIN ": No data to read");
427 pthread_rwlock_unlock(&self->lock);
428 return 0;
429 }
431 if ((*pp_file = fdopen(dup(self->sock_fd), "r")) == NULL)
432 res = -1;
434 pthread_rwlock_unlock(&self->lock);
435 return res;
436 }
438 static void *poll_worker(__attribute__((unused)) void *arg) {
439 char errbuf[MCELOG_BUFF_SIZE];
440 mcelog_thread_running = 1;
441 FILE **pp_file = calloc(1, sizeof(*pp_file));
442 if (pp_file == NULL) {
443 ERROR("mcelog: memory allocation failed: %s",
444 sstrerror(errno, errbuf, sizeof(errbuf)));
445 pthread_exit((void *)1);
446 }
448 pthread_cleanup_push(poll_worker_cleanup, pp_file);
450 while (1) {
451 /* blocking call */
452 int res = socket_adapter.receive(&socket_adapter, pp_file);
453 if (res < 0) {
454 socket_adapter.close(&socket_adapter);
455 while (socket_adapter.reinit(&socket_adapter) != 0) {
456 nanosleep(&CDTIME_T_TO_TIMESPEC(MS_TO_CDTIME_T(MCELOG_POLL_TIMEOUT)),
457 NULL);
458 }
459 continue;
460 }
461 /* timeout or no data to read */
462 else if (res == 0)
463 continue;
465 if (*pp_file == NULL)
466 continue;
468 mcelog_memory_rec_t memory_record = {0};
469 while (parse_memory_info(*pp_file, &memory_record)) {
470 /* Check if location was successfully parsed */
471 if (memory_record.location[0] == '\0') {
472 memset(&memory_record, 0, sizeof(memory_record));
473 continue;
474 }
476 notification_t n = {.severity = NOTIF_OKAY,
477 .time = cdtime(),
478 .message = "Got memory errors info.",
479 .plugin = MCELOG_PLUGIN,
480 .type_instance = "memory_erros"};
482 if (mcelog_prepare_notification(&n, &memory_record) == 0)
483 mcelog_dispatch_notification(&n);
484 if (mcelog_submit(&memory_record) != 0)
485 ERROR(MCELOG_PLUGIN ": Failed to submit memory errors");
486 memset(&memory_record, 0, sizeof(memory_record));
487 }
489 fclose(*pp_file);
490 *pp_file = NULL;
491 }
493 mcelog_thread_running = 0;
494 pthread_cleanup_pop(1);
495 return NULL;
496 }
498 static int mcelog_init(void) {
499 if (socket_adapter.reinit(&socket_adapter) != 0) {
500 ERROR(MCELOG_PLUGIN ": Cannot connect to client socket");
501 return -1;
502 }
504 if (plugin_thread_create(&g_mcelog_config.tid, NULL, poll_worker, NULL,
505 NULL) != 0) {
506 ERROR(MCELOG_PLUGIN ": Error creating poll thread.");
507 return -1;
508 }
509 return 0;
510 }
512 static int get_memory_machine_checks(void) {
513 static const char dump[] = "dump all bios\n";
514 int ret = socket_adapter.write(&socket_adapter, dump, sizeof(dump));
515 if (ret != 0)
516 ERROR(MCELOG_PLUGIN ": SENT DUMP REQUEST FAILED");
517 else
518 DEBUG(MCELOG_PLUGIN ": SENT DUMP REQUEST OK");
519 return ret;
520 }
522 static int mcelog_read(__attribute__((unused)) user_data_t *ud) {
523 DEBUG(MCELOG_PLUGIN ": %s", __FUNCTION__);
525 if (get_memory_machine_checks() != 0)
526 ERROR(MCELOG_PLUGIN ": MACHINE CHECK INFO NOT AVAILABLE");
528 return 0;
529 }
531 static int mcelog_shutdown(void) {
532 int ret = 0;
533 if (mcelog_thread_running) {
534 pthread_cancel(g_mcelog_config.tid);
535 if (pthread_join(g_mcelog_config.tid, NULL) != 0) {
536 ERROR(MCELOG_PLUGIN ": Stopping thread failed.");
537 ret = -1;
538 }
539 }
541 ret = socket_adapter.close(&socket_adapter) || ret;
542 pthread_rwlock_destroy(&(socket_adapter.lock));
543 return -ret;
544 }
/* Plugin entry point: registers the configuration, init, read and shutdown
 * callbacks under the plugin name. */
void module_register(void) {
  plugin_register_complex_config(MCELOG_PLUGIN, mcelog_config);
  plugin_register_init(MCELOG_PLUGIN, mcelog_init);
  plugin_register_complex_read(NULL, MCELOG_PLUGIN, mcelog_read, 0, NULL);
  plugin_register_shutdown(MCELOG_PLUGIN, mcelog_shutdown);
}