On Mon, Jul 31, 2006 at 10:55:50AM -0400, Michael Frey wrote:
[snip]
ok, the attached patch should let you read the history file again.
It cannot bring back the events that were lost, of course - but at least
you should be able to view the history.
It also all happened at the same time, and not for all hosts or tests,
although conn seems to be the hardest hit. Every host I have looked at,
had at least one History graph failure, except for any new host entered
after July 23rd.
I cannot see how that could happen, but just in case, the patch modifies
the history module to not log anything that doesn't have a valid color
code.
The time when this happened - would it happen to coincide with any
cron-jobs, disk intensive activity, or anything else that might cause
file operations to fail or go slow?
Regards,
Henrik
-------------- next part --------------
--- lib/availability.c 2006/07/20 16:06:41 1.42
+++ lib/availability.c 2006/07/31 15:28:39
@@ -219,6 +219,44 @@
return strdup(cause);
}
+static char *get_historyline(char *buf, int bufsize, FILE *fd, int *err,
+ char *colstr, unsigned int *start, unsigned int *duration, int *scanres)
+{
+ int ok;
• + do {
+ ok = 1;
• + if (fgets(buf, bufsize, fd) == NULL) {
+ return NULL;
+ }
• + if (strlen(buf) < 25) {
+ ok = 0;
+ *err += 1;
+ dbgprintf("Bad history line (short): %s\n", buf);
+ continue;
+ }
• + *scanres = sscanf(buf+25, "%s %u %u", colstr, start, duration);
+ if (*scanres < 2) {
+ ok = 0;
+ *err += 1;
+ dbgprintf("Bad history line (missing items): %s\n", buf);
+ continue;
+ }
• + if (parse_color(colstr) == -1) {
+ ok = 0;
+ *err += 1;
+ dbgprintf("Bad history line (bad color string): %s\n", buf);
+ continue;
+ }
+ } while (!ok);
• + return buf;
+}
• static int scan_historyfile(FILE *fd, time_t fromtime, time_t totime,
char *buf, size_t bufsize,
time_t *starttime, time_t *duration, char *colstr)
@@ -235,9 +273,14 @@
/* Is start of history after our report-end time ? */
rewind(fd);
- fgets(buf, bufsize, fd);
- if (sscanf(buf+25, "%s %u %u", colstr, &uistart, &uidur) == 2)
- uidur = time(NULL)-uistart;
+ if (!get_historyline(buf, bufsize, fd, &err, colstr, &uistart, &uidur, &scanres)) {
+ *starttime = time(NULL);
+ *duration = 0;
+ strcpy(colstr, "clear");
+ return err;
+ }
• + if (scanres == 2) uidur = time(NULL)-uistart;
start = uistart; dur = uidur;
if (start > totime) {
@@ -249,8 +292,7 @@
/* First, do a quick scan through the file to find the approximate position where we should start */
while ((start+dur) < fromtime) {
- if (fgets(buf, bufsize, fd)) {
- scanres = sscanf(buf+25, "%s %u %u", colstr, &uistart, &uidur);
+ if (get_historyline(buf, bufsize, fd, &err, colstr, &uistart, &uidur, &scanres)) {
start = uistart; dur = uidur;
if (scanres == 2) dur = time(NULL) - start;
@@ -262,11 +304,6 @@
fgets(buf, bufsize, fd); /* Skip partial line */
}
}
- else {
- err++;
- dbgprintf("Bad line in history file '%s'\n", buf);
- start = dur = 0; /* Try next line */
- }
}
else {
start = time(NULL);
@@ -284,19 +321,11 @@
/* Read one line at a time until we hit start of our report period */
do {
- if (fgets(buf, bufsize, fd)) {
- scanres = sscanf(buf+25, "%s %u %u", colstr, &uistart, &uidur);
+ if (get_historyline(buf, bufsize, fd, &err, colstr, &uistart, &uidur, &scanres)) {
start = uistart; dur = uidur;
if (scanres == 2) dur = time(NULL) - start;
- if (scanres < 2) {
- err++;
- dbgprintf("Bad line in history file '%s'\n", buf);
- start = dur = 0; /* Try next line */
- }
- else {
- dbgprintf("Got entry starting %lu lasting %lu\n", start, dur);
- }
+ dbgprintf("Got entry starting %lu lasting %lu\n", start, dur);
}
else {
start = time(NULL);
@@ -350,7 +379,7 @@
unsigned int uistart, uidur;
char colstr[MAX_LINE_LEN];
int color, done, i, scanres;
- int fileerrors;
+ int fileerrors = 0;
repinfo->fstate = "OK";
repinfo->withreport = 0;
@@ -377,11 +406,15 @@
}
else {
/* Already positioned (probably in a pipe) */
- fgets(l, sizeof(l), fd);
- scanres = sscanf(l+25, "%s %u %u", colstr, &uistart, &uidur);
- starttime = uistart; duration = uidur;
- if (scanres == 2) duration = time(NULL) - starttime;
- fileerrors = 0;
+ if (get_historyline(l, sizeof(l), fd, &fileerrors, colstr, &uistart, &uidur, &scanres)) {
+ starttime = uistart; duration = uidur;
+ if (scanres == 2) duration = time(NULL) - starttime;
+ }
+ else {
+ starttime = time(NULL); duration = 0;
+ strcpy(colstr, "clear");
+ fileerrors = 1;
+ }
}
if (starttime > totime) {
@@ -437,8 +470,7 @@
}
if ((starttime + duration) < totime) {
- if (fgets(l, sizeof(l), fd)) {
- scanres = sscanf(l+25, "%s %u %u", colstr, &uistart, &uidur);
+ if (get_historyline(l, sizeof(l), fd, &fileerrors, colstr, &uistart, &uidur, &scanres)) {
starttime = uistart; duration = uidur;
if (scanres == 2) duration = time(NULL) - starttime;
}
--- hobbitd/hobbitd_history.c 2006/05/25 21:04:44 1.46
+++ hobbitd/hobbitd_history.c 2006/07/31 15:44:15
@@ -167,6 +167,11 @@
downtimeactive = (atoi(items[12]) > 0);
clienttstamp = atoi(items[13]);
+ if (newcolor == -1) {
+ errprintf("Bad message: newcolor is unknown '%s'\n", items[7]);
+ continue;
+ }
• p = hostnamecommas = strdup(hostname); while ((p = strchr(p, '.')) != NULL) *p = ',';
if (save_statusevents) {
@@ -224,7 +229,8 @@
/* Sun Oct 10 06:49:42 2004 red 1097383782 602 */
if ((strlen(l) > 24) &&
- (sscanf(l+24, " %s %d %d", oldcol, &lastchg_i, &dur_i) == 2)) {
+ (sscanf(l+24, " %s %d %d", oldcol, &lastchg_i, &dur_i) == 2) &&
+ (parse_color(oldcol) != -1)) {
/*
* Record the start location of the line
*/
@@ -262,7 +268,11 @@
* Logfile does not exist.
*/
lastchg = tstamp;
- statuslogfd = fopen(statuslogfn, "w");
+ statuslogfd = fopen(statuslogfn, "a");
+ if (statuslogfd == NULL) {
+ errprintf("Cannot open status historyfile '%s' : %s\n",
+ statuslogfn, strerror(errno));
+ }
}
if (strcmp(oldcol, colorname(newcolor)) == 0) {
@@ -300,10 +310,6 @@
fclose(statuslogfd);
}
- else {
- errprintf("Cannot open status historyfile '%s' : %s\n",
- statuslogfn, strerror(errno));
- }
MEMUNDEFINE(statuslogfn);
MEMUNDEFINE(oldcol);