From 3771cd093ba6d9ed16824c2be83e89d128b07aa9 Mon Sep 17 00:00:00 2001 From: arjen Date: Sat, 5 Oct 2002 10:23:08 +0000 Subject: [PATCH] Added design description of gcm_input --- doc/classes-gcm_input.obj | 266 +++++++++++++++++++++++++++++++++++++++++ doc/design.xml | 294 +++++++++++++++++++++++++++++++++++++++++++--- doc/makefile | 33 ++++-- 3 files changed, 571 insertions(+), 22 deletions(-) create mode 100644 doc/classes-gcm_input.obj diff --git a/doc/classes-gcm_input.obj b/doc/classes-gcm_input.obj new file mode 100644 index 0000000..9f058f8 --- /dev/null +++ b/doc/classes-gcm_input.obj @@ -0,0 +1,266 @@ +%TGIF 4.1.41-QPL +state(0,37,100.000,108,0,1,4,1,0,1,1,0,0,0,0,1,0,'Helvetica',0,69120,0,0,0,10,0,1,1,1,0,16,0,0,1,1,1,1,1088,1408,1,0,2880,0). +% +% @(#)$Header: /src/cvsroot/gnucomo/doc/classes-gcm_input.obj,v 1.1 2002-10-05 10:23:08 arjen Exp $ +% %W% +% +unit("1 pixel/pixel"). +color_info(8,65535,0,[ + "black", 0, 0, 0, 0, 0, 0, 1, + "red", 65535, 0, 0, 65535, 0, 0, 1, + "green", 0, 65535, 0, 0, 65535, 0, 1, + "blue", 0, 0, 65535, 0, 0, 65535, 1, + "magenta", 65535, 0, 65535, 65535, 0, 65535, 1, + "cyan", 0, 65535, 65535, 0, 65535, 65535, 1, + "white", 65535, 65535, 65535, 65535, 65535, 65535, 1, + "yellow", 65535, 65535, 0, 65535, 65535, 0, 1 +]). +script_frac("0.6"). +fg_bg_colors('black','White'). +dont_reencode("FFDingbests:ZapfDingbats"). +page(1,"",1,''). +box('black','',96,72,680,576,0,2,1,209,0,0,0,0,0,'2',0,[ +]). +text('black',388,534,1,1,1,218,23,211,18,5,0,0,0,0,2,218,23,0,0,"",0,0,0,0,552,'',[ +minilines(218,23,0,0,1,0,0,[ +mini_line(218,18,5,0,0,0,[ +str_block(0,218,18,5,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,103680,218,18,5,0,-1,0,0,0,0,0, + "gcm_input class diagram")]) +]) +])]). 
+group([ +box('cyan','',352,436,480,500,1,1,1,848,0,0,0,0,0,'1',0,[ +]), +text('black',416,448,1,1,1,71,15,849,12,3,0,0,0,0,2,71,15,0,0,"",0,0,0,0,460,'',[ +minilines(71,15,0,0,1,0,0,[ +mini_line(71,12,3,0,0,0,[ +str_block(0,71,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,71,12,3,0,-1,0,0,0,0,0, + "PgDatabase")]) +]) +])]), +box('black','',352,436,480,500,0,1,1,867,0,0,0,0,0,'1',0,[ +]) +], +880,0,0,[ +]). +group([ +box('black','',124,100,252,164,0,1,1,882,0,0,0,0,0,'1',0,[ +]), +text('black',188,112,1,1,1,96,15,883,12,3,0,0,0,0,2,96,15,0,0,"",0,0,0,0,124,'',[ +minilines(96,15,0,0,1,0,0,[ +mini_line(96,12,3,0,0,0,[ +str_block(0,96,12,3,0,0,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,96,12,3,0,0,0,0,0,0,0, + "message_buffer")]) +]) +])]) +], +881,0,0,[ +]). +group([ +box('cyan','',124,252,252,316,1,1,1,888,0,0,0,0,0,'1',0,[ +]), +text('black',188,264,1,1,1,45,15,889,12,3,0,0,0,0,2,45,15,0,0,"",0,0,0,0,276,'',[ +minilines(45,15,0,0,1,0,0,[ +mini_line(45,12,3,0,0,0,[ +str_block(0,45,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,45,12,3,0,-1,0,0,0,0,0, + "istream")]) +]) +])]), +box('black','',124,252,252,316,0,1,1,890,0,0,0,0,0,'1',0,[ +]) +], +887,0,0,[ +]). +group([ +box('black','',520,304,648,368,0,1,1,896,0,0,0,0,0,'1',0,[ +]), +text('black',584,316,1,1,1,96,15,897,12,3,0,0,0,0,2,96,15,0,0,"",0,0,0,0,328,'',[ +minilines(96,15,0,0,1,0,0,[ +mini_line(96,12,3,0,0,0,[ +str_block(0,96,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,96,12,3,0,-1,0,0,0,0,0, + "gnucomo_config")]) +]) +])]) +], +895,0,0,[ +]). +poly('black','',2,[ + 416,388,416,436],0,1,1,914,0,0,0,0,0,0,0,'1',0,0, + "0","",[ + 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ +]). 
+group([ +group([ +polygon('black','',5,[ + 416,368,412,378,416,388,420,378,416,368],0,1,1,0,813,0,0,0,0,0,'1',0, + "00",[ +]), +box('black','',412,372,420,384,0,1,0,814,0,0,0,0,0,'1',0,[ +attr("", "auto_center_attr", 0, 1, 0, +text('black',416,371,1,1,1,144,23,815,18,5,0,0,0,0,2,144,23,0,0,"",0,0,0,0,389,'',[ +minilines(144,23,0,0,1,0,0,[ +mini_line(144,18,5,0,0,0,[ +str_block(0,144,18,5,0,0,0,0,0,[ +str_seg('black','Helvetica-Bold',1,103680,144,18,5,0,0,0,0,0,0,0, + "auto_center_attr")]) +]) +])])), +attr("label=", "", 1, 0, 0, +text('black',416,366,1,1,1,0,23,816,18,5,0,0,0,0,2,0,23,0,0,"",0,0,0,0,384,'',[ +minilines(0,23,0,0,1,0,0,[ +mini_line(0,18,5,0,0,0,[ +str_block(0,0,18,5,0,0,0,0,0,[ +str_seg('black','Helvetica-Bold',1,103680,0,18,5,0,0,0,0,0,0,0, + "")]) +]) +])])) +]) +], +817,0,0,[ +]), +group([ +box('black','',352,304,480,368,0,1,1,824,0,0,0,0,0,'1',0,[ +]), +text('black',416,316,1,1,1,114,15,825,12,3,0,0,0,0,2,114,15,0,0,"",0,0,0,0,328,'',[ +minilines(114,15,0,0,1,0,0,[ +mini_line(114,12,3,0,0,0,[ +str_block(0,114,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,114,12,3,0,-1,0,0,0,0,0, + "gnucomo_database")]) +]) +])]) +], +823,0,0,[ +]) +], +937,0,0,[ +]). +poly('black','',2,[ + 340,128,252,128],0,1,1,948,0,0,0,0,0,0,0,'1',0,0, + "0","",[ + 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ +]). +poly('black','',2,[ + 184,164,184,252],0,1,1,970,0,0,0,0,0,0,0,'1',0,0, + "0","",[ + 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ +]). +poly('black','',2,[ + 416,232,416,304],0,1,1,971,0,0,0,0,0,0,0,'1',0,0, + "0","",[ + 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ +]). +poly('black','',2,[ + 480,332,520,332],0,1,1,972,0,0,0,0,0,0,0,'1',0,0, + "0","",[ + 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ +]). 
+group([ +group([ +box('black','',360,104,488,232,0,1,1,89,0,0,0,0,0,'1',0,[ +]), +text('black',424,116,1,1,1,92,15,90,12,3,0,0,0,0,2,92,15,0,0,"",0,0,0,0,128,'',[ +minilines(92,15,0,0,1,0,0,[ +mini_line(92,12,3,0,0,0,[ +str_block(0,92,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,92,12,3,0,-1,0,0,0,0,0, + "client_message")]) +]) +])]) +], +913,0,0,[ +]), +group([ +polygon('black','',5,[ + 350,124,340,128,350,132,360,128,350,124],0,1,1,0,960,0,0,0,0,0,'1',0, + "00",[ +]), +box('black','',342,126,358,130,0,1,0,961,0,0,0,0,0,'1',0,[ +attr("", "auto_center_attr", 0, 1, 0, +text('black',350,125,1,1,1,100,15,962,12,3,0,0,0,0,2,100,15,0,0,"",0,0,0,0,137,'',[ +minilines(100,15,0,0,1,0,0,[ +mini_line(100,12,3,0,0,0,[ +str_block(0,100,12,3,0,0,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,100,12,3,0,0,0,0,0,0,0, + "auto_center_attr")]) +]) +])])), +attr("label=", "", 1, 0, 0, +text('black',350,120,1,1,1,0,15,963,12,3,0,0,0,0,2,0,15,0,0,"",0,0,0,0,132,'',[ +minilines(0,15,0,0,1,0,0,[ +mini_line(0,12,3,0,0,0,[ +str_block(0,0,12,3,0,0,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,0,12,3,0,0,0,0,0,0,0, + "")]) +]) +])])) +]) +], +964,0,0,[ +]), +poly('black','',2,[ + 360,136,488,136],0,1,1,974,0,0,0,0,0,0,0,'1',0,0, + "0","",[ + 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ +]), +text('black',364,136,1,0,1,53,15,975,12,3,0,0,0,0,2,53,15,0,0,"",0,0,0,0,148,'',[ +minilines(53,15,0,0,0,0,0,[ +mini_line(53,12,3,0,0,0,[ +str_block(0,53,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica',0,69120,53,12,3,0,-1,0,0,0,0,0, + "hostname")]) +]) +])]), +text('black',364,148,1,0,1,64,15,977,12,3,0,0,0,0,2,64,15,0,0,"",0,0,0,0,160,'',[ +minilines(64,15,0,0,0,0,0,[ +mini_line(64,12,3,0,0,0,[ +str_block(0,64,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica',0,69120,64,12,3,0,-1,0,0,0,0,0, + "arrival_time")]) +]) +])]), +text('black',364,184,1,0,1,50,15,979,12,3,0,0,0,0,2,50,15,0,0,"",0,0,0,0,196,'',[ +minilines(50,15,0,0,0,0,0,[ +mini_line(50,12,3,0,0,0,[ 
+str_block(0,50,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica',0,69120,50,12,3,0,-1,0,0,0,0,0, + "classify()")]) +]) +])]), +text('black',364,200,1,0,1,36,15,981,12,3,0,0,0,0,2,36,15,0,0,"",0,0,0,0,212,'',[ +minilines(36,15,0,0,0,0,0,[ +mini_line(36,12,3,0,0,0,[ +str_block(0,36,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica',0,69120,36,12,3,0,-1,0,0,0,0,0, + "enter()")]) +]) +])]), +poly('black','',2,[ + 360,184,488,184],0,1,1,983,0,0,0,0,0,0,0,'1',0,0, + "0","",[ + 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ +]), +text('black',320,108,1,0,1,27,15,984,12,3,0,0,0,0,2,27,15,0,0,"",0,0,0,0,120,'',[ +minilines(27,15,0,0,0,0,0,[ +mini_line(27,12,3,0,0,0,[ +str_block(0,27,12,3,0,0,0,0,0,[ +str_seg('black','Helvetica',0,69120,27,12,3,0,0,0,0,0,0,0, + "input")]) +]) +])]), +text('black',364,160,1,0,1,41,15,986,12,3,0,0,0,0,2,41,15,0,0,"",0,0,0,0,172,'',[ +minilines(41,15,0,0,0,0,0,[ +mini_line(41,12,3,0,0,0,[ +str_block(0,41,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica',0,69120,41,12,3,0,-1,0,0,0,0,0, + "service")]) +]) +])]) +], +1006,0,0,[ +]). diff --git a/doc/design.xml b/doc/design.xml index f14ef8f..e00046b 100644 --- a/doc/design.xml +++ b/doc/design.xml @@ -6,7 +6,7 @@ Arjen Baart <arjen@andromeda.nl> Brenno de Winter<brenno@dewinter.com> - July 12, 2002 + September 10, 2002 0.1 Andromeda Technology & Automation @@ -46,17 +50,17 @@ and is based upon the development manifest. The architecture of gnucomo is shown in the -dataflow diagram below: +data flow diagram below: - + Architectural items to consider: -Active and passive data aquisition +Active and passive data acquisition Monitoring static and dynamic system parameters Upper and lower limits for system parameters @@ -81,12 +85,39 @@ database as described in the manifest. Database design -Log entries are stored in a database with at least the following fields: +The design of the database is described extensively in +the Manifest. 
+Assuming development is done on the same system on which the real (production) +gnucomo database is maintained, there is a need for a separate database +on which to perform development and integration tests. +Quite often, the test database will need to be destroyed and recreated. +To enable testing of gnucomo applications, all programs +need to access either the test database or the production database. +To accommodate this, each application needs an option to override the +default name of the configuration file (gnucomo.conf). + + + +To create a convenient programming interface for object oriented languages, +a class gnucomo_database provides an abstract layer which +hides the details of the database implementation. +An object of this class maintains the connection to the database server +and provides convenience functions for accessing information in the +database. +A constructor of the gnucomo_database is passed a reference to +the gnucomo_config object in order to access the database. +This accommodates both production and test databases. +The constructor will immediately try to connect to the database and check its +validity. +The destructor will of course close the database connection. + + + +Other methods provide access to the database. +There will be lots more in the future, but here are a few to begin with: -hostname -timestamp -service (kernel, daemon, ...) -Log message +Find the objectid of a host, given its hostname +Insert a log record into the log table @@ -95,7 +126,7 @@ Log entries are stored in a database with at least the following fields: Configuration -Configurational parameters are stored in a XML formatted configuration file. +Configuration parameters are stored in an XML formatted configuration file. The config file contains a two-level hierarchy. The first level denotes the section for which the parameter is used and the second level is the parameter itself. @@ -137,8 +168,8 @@ Other database systems are not supported yet. 
gnucomo_config class -Each Gnucomo application should have exectly one object of the -gnucomo_config to obtain its configurational +Each Gnucomo application should have exactly one object of the +gnucomo_config to obtain its configuration parameters. The following methods are supported in this class: @@ -175,10 +206,245 @@ The following methods are supported in this class:
+gcm_input + + +gcm_input is the application which captures messages from client +systems in one form or another and tries to store information from these messages +into the database. +A client message may arrive in a number of forms and through any kind of +transport channel. +Here are a few examples: + + +Copied directly from a local client's file system. +Copied remotely from a client's file system, e.g. using +ftp, rcp or scp. +Through an email. + + +On top of that, any message may be encrypted, for example with PGP or GnuPG. +In any of these situations, gcm_input should be able to extract +as much information as possible from the client's message. +In case the message is encrypted, it may not be possible to run gcm_input +in the background, since human intervention is needed to enter the secret key. + + +The primary function of gcm_input is to store lines from a client's log files +into the log table. +To do this, we need certain information about the client message that is usually not +in the content of a log file. +This information includes: + +The source of the log file, most often in the form of the client's hostname. +The time at which the log file arrived on the server. +The service on the client which produced the log file. + + +Sometimes, this information is available from the message itself, as in an email header. +On other occasions, the information needs to be supplied externally, +e.g. by using command line options. + + +Apart from determining information about the client's message, the content +of the message needs to be analyzed in order to handle it properly. +The body of the message may contain all sorts of information, such as: + +System log file +Apache log file +Report from a Gnucomo agent +Something else... + + +The message is analyzed to obtain information about what the message entails +and where it came from. +The classify() method tries to extract that information. 
+Sometimes, this information can not be determined with absolute 100% certainty. +The certainty expresses how sure we are about the contents in the message. +Classifying a message may be performed with an algorithm as shown in +the following pseudo code: + + +while certainty < ε AND not at end + + Scan for a marker + + Adjust certainty + + +Initially, a message is not classified and the certainty is 0.0. +Some lines point toward a certain class of message but do not absolutely determine +the class of a message. Other pieces of text are typical for a certain message class. +Examples of markers that determine the classification of a client message +are discussed below. + + +From - Sat Sep 14 15:01:15 2002 + + +This is almost certainly a UNIX style mail header. +There should be lines beginning with From: and Date: +before the first empty line is encountered. +The hostname of the client that sent the message and the time of arrival +can be determined from these email header lines. +The content of the message is still to be determined by matching +other markers. + + +-----BEGIN PGP MESSAGE----- + + +Such a line in the message certainly means that the message is PGP or GnuPG +encrypted. +Decrypting is possible only if someone or something provides a secret key. + + +Sep 1 04:20:00 kithira kernel: solo1: unloading + + +The general pattern of a system log file is an abbreviated month name, a day, +a time, a name of a host without the domain, the name of a service followed +by a colon and finally, the message of that service. +We can match this with a regular expression to see if the message holds syslog lines. +Similar matches can be used to find Apache log lines or output from the dump +backup program or anything else. + + + +The message classification embodies the way in which a message must be +handled and in what way information from the message can be put into +the database. +Aspects for handling the message are for example: + +Strip lines at the beginning or end. 
+Store each line separately or store the message as a whole. +How to extract hostname, arrival time and service from the message. +How to break up the message into individual fields for a log record. + + + + +The figure below shows the class diagram that is used for gcm_input: + + + + +The heart of the application is a client_message object. +This object reads the message through the message_buffer from some +input stream (file, string, stdin or socket), classifies the message and +enters information from the message into the database. +It has a relationship with a gnucomo_database object which +is an abstraction of the tables in the database. +These are the methods for the client_message class: + + +client_message::client_message(istream *in, gnucomo_database *db) + + Constructor. + + +double client_message::classify(String host, date arrival_d, hour arrival_t, String serv) + + Try to classify the message and return the certainty with which the class of the + message could be determined. + If the hostname, arrival time and service cannot be extracted from the message, + use the arguments as defaults. + + +int client_message::enter() + + Insert the message contents into the log table of the gnucomo + database. + Returns the number of records inserted. + + + + + + +Some kind of input buffering is needed when a client message is being processed. +The contents of the message are not entirely clear until a few lines are analyzed, +and these lines probably need to be read again. +When the message is stored in a file, this is no problem; a simple lseek allows us +to read any line over and over again. +However, when the message comes from an input stream, like a TCP socket or just +plain old stdin, this is a different matter. +Lines of the message that have already been read will be lost forever, unless they are +stored somewhere. +To store an input stream temporarily, there are two options: + +In an internal memory buffer. +In an external (temporary) file. 
+ +The message_buffer class takes care of the input buffering, thus +hiding these implementation details. +On the outside, a message_buffer can be read line by line until the +end of the input is reached. +Lines of input can be read again by backing up to the beginning of the message +by using the rewind() method or by backing up one line +with the -- operator. +The message_buffer object maintains a pointer to the next +available line. +The ++ operator, being the opposite of the -- +operator, skips one line. + + + +The >> operator reads data from the message +into the second (String) operand, just like the >> +operator for an istream. +There is a small difference, though. +The >> operator for a message_buffer +returns a boolean value which states if there actually was input available. +This value will usually turn to false at the end of file. +A second difference is the fact that input data can only be read into +String objects a line at a time. +There are no functions for reading integer or floating point numbers. +The >> operator reads the next line either from +an internal buffer or from the external input stream if the internal +buffer is exhausted. +Lines read from the input stream are cached in the internal buffer, +so they are available for reading another time, e.g. after +rewinding to the beginning of the message. + + + +Methods for the message_buffer class: + + +message_buffer::message_buffer(istream *in) + + Constructor. + +bool operator >>(message_buffer &, String &) + +message_buffer::rewind() +message_buffer::operator -- +message_buffer::operator ++ + + + +Command arguments + + +Gcm_input understands the following command line arguments: + +-c <name> : Configuration name +-d <date> : Arrival time of the message +-h <hostname> : FQDN of the client +-s <service> : service that created the log +-v : verbose output. Print lots of debug information +-V : Print version and exit + + + +
+ +
Design ideas -Use of a neural network to analyse system logs: +Use of a neural network to analyze system logs: Classify words Classify message based on word classification diff --git a/doc/makefile b/doc/makefile index 1f37e6e..ca1f9a4 100644 --- a/doc/makefile +++ b/doc/makefile @@ -7,25 +7,27 @@ .obj.eps: tgif -print -eps -color $< -XMLS = manifest.xml +XMLS = manifest.xml design.xml IMAGES= dataflow.png architecture.png erd-action.png erd-anu.png erd-log.png \ erd-lognotif.png erd-notif.png erd-object.png erd-objissue.png \ erd-objprior.png erd-objservice.png erd-objsysusr.png erd-objusr.png \ erd-prior.png erd-service.png erd-status.png erd-toi.png \ - erd-unplog.png erd-usr.png erd.png + erd-unplog.png erd-usr.png erd.png \ + classes-gcm_input.png PICTURES= dataflow.eps architecture.eps erd-action.eps erd-anu.eps erd-log.eps \ erd-lognotif.eps erd-notif.eps erd-object.eps erd-objissue.eps \ erd-objprior.eps erd-objservice.eps erd-objsysusr.eps erd-objusr.eps \ erd-prior.eps erd-service.eps erd-status.eps erd-toi.eps \ - erd-unplog.eps erd-usr.eps erd.eps + erd-unplog.eps erd-usr.eps erd.eps \ + classes-gcm_input.eps -html: manifest.html +html: manifest.html design.html -ps: manifest.ps +ps: manifest.ps design.ps -txt: manifest.txt +txt: manifest.txt design.txt all: ps html txt @@ -44,9 +46,24 @@ manifest.pdf : $(XMLS) $(PICTURES) manifest.txt : $(XMLS) xml2text manifest.xml > manifest.txt +design.html : $(XMLS) $(IMAGES) + xml2html design.xml > design.html + +design.ps : $(XMLS) $(PICTURES) + xml2latex design.xml >design.tex + latex design.tex + dvips -o design.ps design.dvi + +design.pdf : $(XMLS) $(PICTURES) + xml2latex design.xml >design.tex + pdflatex design.tex + +design.txt : $(XMLS) + xml2text design.xml > design.txt + check: xmllint --noout --valid $(XMLS) clean: - rm -f manifest.html manifest.ps manifest.tex manifest.dvi - rm -f manifest.log manifest.txt manifest.pdf + rm -f *.html *.ps *.tex *.dvi *.toc *.aux + rm -f *.log *.txt *.pdf rm -f $(IMAGES) 
$(PICTURES) -- 2.11.0