--- /dev/null
+%TGIF 4.1.41-QPL
+state(0,37,100.000,108,0,1,4,1,0,1,1,0,0,0,0,1,0,'Helvetica',0,69120,0,0,0,10,0,1,1,1,0,16,0,0,1,1,1,1,1088,1408,1,0,2880,0).
+%
+% @(#)$Header: /src/cvsroot/gnucomo/doc/classes-gcm_input.obj,v 1.1 2002-10-05 10:23:08 arjen Exp $
+% %W%
+%
+unit("1 pixel/pixel").
+color_info(8,65535,0,[
+ "black", 0, 0, 0, 0, 0, 0, 1,
+ "red", 65535, 0, 0, 65535, 0, 0, 1,
+ "green", 0, 65535, 0, 0, 65535, 0, 1,
+ "blue", 0, 0, 65535, 0, 0, 65535, 1,
+ "magenta", 65535, 0, 65535, 65535, 0, 65535, 1,
+ "cyan", 0, 65535, 65535, 0, 65535, 65535, 1,
+ "white", 65535, 65535, 65535, 65535, 65535, 65535, 1,
+ "yellow", 65535, 65535, 0, 65535, 65535, 0, 1
+]).
+script_frac("0.6").
+fg_bg_colors('black','White').
+dont_reencode("FFDingbests:ZapfDingbats").
+page(1,"",1,'').
+box('black','',96,72,680,576,0,2,1,209,0,0,0,0,0,'2',0,[
+]).
+text('black',388,534,1,1,1,218,23,211,18,5,0,0,0,0,2,218,23,0,0,"",0,0,0,0,552,'',[
+minilines(218,23,0,0,1,0,0,[
+mini_line(218,18,5,0,0,0,[
+str_block(0,218,18,5,0,-1,0,0,0,[
+str_seg('black','Helvetica-Bold',1,103680,218,18,5,0,-1,0,0,0,0,0,
+ "gcm_input class diagram")])
+])
+])]).
+group([
+box('cyan','',352,436,480,500,1,1,1,848,0,0,0,0,0,'1',0,[
+]),
+text('black',416,448,1,1,1,71,15,849,12,3,0,0,0,0,2,71,15,0,0,"",0,0,0,0,460,'',[
+minilines(71,15,0,0,1,0,0,[
+mini_line(71,12,3,0,0,0,[
+str_block(0,71,12,3,0,-1,0,0,0,[
+str_seg('black','Helvetica-Bold',1,69120,71,12,3,0,-1,0,0,0,0,0,
+ "PgDatabase")])
+])
+])]),
+box('black','',352,436,480,500,0,1,1,867,0,0,0,0,0,'1',0,[
+])
+],
+880,0,0,[
+]).
+group([
+box('black','',124,100,252,164,0,1,1,882,0,0,0,0,0,'1',0,[
+]),
+text('black',188,112,1,1,1,96,15,883,12,3,0,0,0,0,2,96,15,0,0,"",0,0,0,0,124,'',[
+minilines(96,15,0,0,1,0,0,[
+mini_line(96,12,3,0,0,0,[
+str_block(0,96,12,3,0,0,0,0,0,[
+str_seg('black','Helvetica-Bold',1,69120,96,12,3,0,0,0,0,0,0,0,
+ "message_buffer")])
+])
+])])
+],
+881,0,0,[
+]).
+group([
+box('cyan','',124,252,252,316,1,1,1,888,0,0,0,0,0,'1',0,[
+]),
+text('black',188,264,1,1,1,45,15,889,12,3,0,0,0,0,2,45,15,0,0,"",0,0,0,0,276,'',[
+minilines(45,15,0,0,1,0,0,[
+mini_line(45,12,3,0,0,0,[
+str_block(0,45,12,3,0,-1,0,0,0,[
+str_seg('black','Helvetica-Bold',1,69120,45,12,3,0,-1,0,0,0,0,0,
+ "istream")])
+])
+])]),
+box('black','',124,252,252,316,0,1,1,890,0,0,0,0,0,'1',0,[
+])
+],
+887,0,0,[
+]).
+group([
+box('black','',520,304,648,368,0,1,1,896,0,0,0,0,0,'1',0,[
+]),
+text('black',584,316,1,1,1,96,15,897,12,3,0,0,0,0,2,96,15,0,0,"",0,0,0,0,328,'',[
+minilines(96,15,0,0,1,0,0,[
+mini_line(96,12,3,0,0,0,[
+str_block(0,96,12,3,0,-1,0,0,0,[
+str_seg('black','Helvetica-Bold',1,69120,96,12,3,0,-1,0,0,0,0,0,
+ "gnucomo_config")])
+])
+])])
+],
+895,0,0,[
+]).
+poly('black','',2,[
+ 416,388,416,436],0,1,1,914,0,0,0,0,0,0,0,'1',0,0,
+ "0","",[
+ 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
+]).
+group([
+group([
+polygon('black','',5,[
+ 416,368,412,378,416,388,420,378,416,368],0,1,1,0,813,0,0,0,0,0,'1',0,
+ "00",[
+]),
+box('black','',412,372,420,384,0,1,0,814,0,0,0,0,0,'1',0,[
+attr("", "auto_center_attr", 0, 1, 0,
+text('black',416,371,1,1,1,144,23,815,18,5,0,0,0,0,2,144,23,0,0,"",0,0,0,0,389,'',[
+minilines(144,23,0,0,1,0,0,[
+mini_line(144,18,5,0,0,0,[
+str_block(0,144,18,5,0,0,0,0,0,[
+str_seg('black','Helvetica-Bold',1,103680,144,18,5,0,0,0,0,0,0,0,
+ "auto_center_attr")])
+])
+])])),
+attr("label=", "", 1, 0, 0,
+text('black',416,366,1,1,1,0,23,816,18,5,0,0,0,0,2,0,23,0,0,"",0,0,0,0,384,'',[
+minilines(0,23,0,0,1,0,0,[
+mini_line(0,18,5,0,0,0,[
+str_block(0,0,18,5,0,0,0,0,0,[
+str_seg('black','Helvetica-Bold',1,103680,0,18,5,0,0,0,0,0,0,0,
+ "")])
+])
+])]))
+])
+],
+817,0,0,[
+]),
+group([
+box('black','',352,304,480,368,0,1,1,824,0,0,0,0,0,'1',0,[
+]),
+text('black',416,316,1,1,1,114,15,825,12,3,0,0,0,0,2,114,15,0,0,"",0,0,0,0,328,'',[
+minilines(114,15,0,0,1,0,0,[
+mini_line(114,12,3,0,0,0,[
+str_block(0,114,12,3,0,-1,0,0,0,[
+str_seg('black','Helvetica-Bold',1,69120,114,12,3,0,-1,0,0,0,0,0,
+ "gnucomo_database")])
+])
+])])
+],
+823,0,0,[
+])
+],
+937,0,0,[
+]).
+poly('black','',2,[
+ 340,128,252,128],0,1,1,948,0,0,0,0,0,0,0,'1',0,0,
+ "0","",[
+ 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
+]).
+poly('black','',2,[
+ 184,164,184,252],0,1,1,970,0,0,0,0,0,0,0,'1',0,0,
+ "0","",[
+ 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
+]).
+poly('black','',2,[
+ 416,232,416,304],0,1,1,971,0,0,0,0,0,0,0,'1',0,0,
+ "0","",[
+ 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
+]).
+poly('black','',2,[
+ 480,332,520,332],0,1,1,972,0,0,0,0,0,0,0,'1',0,0,
+ "0","",[
+ 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
+]).
+group([
+group([
+box('black','',360,104,488,232,0,1,1,89,0,0,0,0,0,'1',0,[
+]),
+text('black',424,116,1,1,1,92,15,90,12,3,0,0,0,0,2,92,15,0,0,"",0,0,0,0,128,'',[
+minilines(92,15,0,0,1,0,0,[
+mini_line(92,12,3,0,0,0,[
+str_block(0,92,12,3,0,-1,0,0,0,[
+str_seg('black','Helvetica-Bold',1,69120,92,12,3,0,-1,0,0,0,0,0,
+ "client_message")])
+])
+])])
+],
+913,0,0,[
+]),
+group([
+polygon('black','',5,[
+ 350,124,340,128,350,132,360,128,350,124],0,1,1,0,960,0,0,0,0,0,'1',0,
+ "00",[
+]),
+box('black','',342,126,358,130,0,1,0,961,0,0,0,0,0,'1',0,[
+attr("", "auto_center_attr", 0, 1, 0,
+text('black',350,125,1,1,1,100,15,962,12,3,0,0,0,0,2,100,15,0,0,"",0,0,0,0,137,'',[
+minilines(100,15,0,0,1,0,0,[
+mini_line(100,12,3,0,0,0,[
+str_block(0,100,12,3,0,0,0,0,0,[
+str_seg('black','Helvetica-Bold',1,69120,100,12,3,0,0,0,0,0,0,0,
+ "auto_center_attr")])
+])
+])])),
+attr("label=", "", 1, 0, 0,
+text('black',350,120,1,1,1,0,15,963,12,3,0,0,0,0,2,0,15,0,0,"",0,0,0,0,132,'',[
+minilines(0,15,0,0,1,0,0,[
+mini_line(0,12,3,0,0,0,[
+str_block(0,0,12,3,0,0,0,0,0,[
+str_seg('black','Helvetica-Bold',1,69120,0,12,3,0,0,0,0,0,0,0,
+ "")])
+])
+])]))
+])
+],
+964,0,0,[
+]),
+poly('black','',2,[
+ 360,136,488,136],0,1,1,974,0,0,0,0,0,0,0,'1',0,0,
+ "0","",[
+ 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
+]),
+text('black',364,136,1,0,1,53,15,975,12,3,0,0,0,0,2,53,15,0,0,"",0,0,0,0,148,'',[
+minilines(53,15,0,0,0,0,0,[
+mini_line(53,12,3,0,0,0,[
+str_block(0,53,12,3,0,-1,0,0,0,[
+str_seg('black','Helvetica',0,69120,53,12,3,0,-1,0,0,0,0,0,
+ "hostname")])
+])
+])]),
+text('black',364,148,1,0,1,64,15,977,12,3,0,0,0,0,2,64,15,0,0,"",0,0,0,0,160,'',[
+minilines(64,15,0,0,0,0,0,[
+mini_line(64,12,3,0,0,0,[
+str_block(0,64,12,3,0,-1,0,0,0,[
+str_seg('black','Helvetica',0,69120,64,12,3,0,-1,0,0,0,0,0,
+ "arrival_time")])
+])
+])]),
+text('black',364,184,1,0,1,50,15,979,12,3,0,0,0,0,2,50,15,0,0,"",0,0,0,0,196,'',[
+minilines(50,15,0,0,0,0,0,[
+mini_line(50,12,3,0,0,0,[
+str_block(0,50,12,3,0,-1,0,0,0,[
+str_seg('black','Helvetica',0,69120,50,12,3,0,-1,0,0,0,0,0,
+ "classify()")])
+])
+])]),
+text('black',364,200,1,0,1,36,15,981,12,3,0,0,0,0,2,36,15,0,0,"",0,0,0,0,212,'',[
+minilines(36,15,0,0,0,0,0,[
+mini_line(36,12,3,0,0,0,[
+str_block(0,36,12,3,0,-1,0,0,0,[
+str_seg('black','Helvetica',0,69120,36,12,3,0,-1,0,0,0,0,0,
+ "enter()")])
+])
+])]),
+poly('black','',2,[
+ 360,184,488,184],0,1,1,983,0,0,0,0,0,0,0,'1',0,0,
+ "0","",[
+ 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
+]),
+text('black',320,108,1,0,1,27,15,984,12,3,0,0,0,0,2,27,15,0,0,"",0,0,0,0,120,'',[
+minilines(27,15,0,0,0,0,0,[
+mini_line(27,12,3,0,0,0,[
+str_block(0,27,12,3,0,0,0,0,0,[
+str_seg('black','Helvetica',0,69120,27,12,3,0,0,0,0,0,0,0,
+ "input")])
+])
+])]),
+text('black',364,160,1,0,1,41,15,986,12,3,0,0,0,0,2,41,15,0,0,"",0,0,0,0,172,'',[
+minilines(41,15,0,0,0,0,0,[
+mini_line(41,12,3,0,0,0,[
+str_block(0,41,12,3,0,-1,0,0,0,[
+str_seg('black','Helvetica',0,69120,41,12,3,0,-1,0,0,0,0,0,
+ "service")])
+])
+])])
+],
+1006,0,0,[
+]).
<!--
XML documentation system
Original author : Arjen Baart - arjen@andromeda.nl
- Version : $Revision: 1.1 $
+ Version : $Revision: 1.2 $
This document is prepared for XMLDoc. Transform to HTML,
LaTeX, Postscript or plain text with XMLDoc utilities and
<book>
<titlepage>
<title>Gnucomo - Computer Monitoring</title>
+ <subtitle>Design description</subtitle>
+<!--
+ <para><picture src='logo.png' eps='logo' scale='0.7'/></para>
+-->
<author>Arjen Baart <code><arjen@andromeda.nl></code></author>
<author>Brenno de Winter<code><brenno@dewinter.com></code></author>
- <date>July 12, 2002</date>
+ <date>September 10, 2002</date>
<docinfo>
<infoitem label="Version">0.1</infoitem>
<infoitem label="Organization">Andromeda Technology & Automation</infoitem>
<para>
The architecture of <strong>gnucomo</strong> is shown in the
-dataflow diagram below:
+data flow diagram below:
</para>
<para>
- <picture src='dataflow.png' eps='dataflow.eps'/>
+ <picture src='dataflow.png' eps='dataflow' scale='0.7'/>
</para>
<para>
Architectural items to consider:
<itemize>
-<item>Active and passive data aquisition</item>
+<item>Active and passive data acquisition</item>
<item>Monitoring static and dynamic system parameters</item>
<item>Upper and lower limits for system parameters</item>
</itemize>
<heading>Database design</heading>
<para>
-Log entries are stored in a database with at least the following fields:
+The design of the database is described extensively in
+<reference href="manifest.html">the Manifest</reference>.
+Assuming development is done on the same system on which the real (production)
+gnucomo database is maintained, there is a need for a separate database
+on which to perform development and integration tests.
+Quite often, the test database will need to be destroyed and recreated.
+To enable testing of <strong>gnucomo</strong> applications, all programs
+need to access either the test database or the production database.
+To accommodate this, each application needs an option to override the
+default name of the configuration file (gnucomo.conf).
+</para>
+
+<para>
+To create a convenient programming interface for object oriented languages,
+a class <emph>gnucomo_database</emph> provides an abstract layer which
+hides the details of the database implementation.
+An object of this class maintains the connection to the database server
+and provides convenience functions for accessing information in the
+database.
+The constructor of the <emph>gnucomo_database</emph> class is passed a reference to
+the <emph>gnucomo_config</emph> object in order to access the database.
+This accommodates both production and test databases.
+The constructor will immediately try to connect to the database and check its
+validity.
+The destructor will of course close the database connection.
+</para>
+
+<para>
+Other methods provide access to the database.
+There will be lots more in the future, but here are a few to begin with:
<itemize>
-<item>hostname</item>
-<item>timestamp</item>
-<item>service (kernel, daemon, ...)</item>
-<item>Log message</item>
+<item>Find the objectid of a host, given its hostname</item>
+<item>Insert a log record into the log table</item>
</itemize>
</para>
</section>
<heading>Configuration</heading>
<para>
-Configurational parameters are stored in a XML formatted configuration file.
+Configuration parameters are stored in an XML formatted configuration file.
The config file contains a two-level hierarchy.
The first level denotes the section for which the parameter is used
and the second level is the parameter itself.
<heading>gnucomo_config class</heading>
<para>
-Each Gnucomo application should have exectly one object of the
-<strong>gnucomo_config</strong> to obtain its configurational
+Each Gnucomo application should have exactly one object of the
+<strong>gnucomo_config</strong> class to obtain its configuration
parameters.
The following methods are supported in this class:
</subsection>
</section>
+<section>
+<heading>gcm_input</heading>
+
+<para>
+<strong>gcm_input</strong> is the application which captures messages from client
+systems in one form or another and tries to store information from these messages
+into the database.
+A client message may arrive in a number of forms and through any kind of
+transportation channel.
+Here are a few examples:
+
+<itemize>
+<item>Copied directly from a local client's file system.</item>
+<item>Copied remotely from a client's file system, e.g. using
+<code>ftp</code>, <code>rcp</code> or <code>scp</code>.</item>
+<item>Through an email.</item>
+</itemize>
+
+On top of that, any message may be encrypted, for example with PGP or GnuPG.
+In any of these situations, <strong>gcm_input</strong> should be able to extract
+as much information as possible from the client's message.
+In case the message is encrypted, it may not be possible to run <strong>gcm_input</strong>
+in the background, since human intervention is needed to enter the secret key.
+</para>
+<para>
+The primary function of <strong>gcm_input</strong> is to store lines from a client's log files
+into the <emph>log</emph> table.
+To do this, we need certain information about the client message that is usually not
+in the content of a log file.
+This information includes:
+<itemize>
+<item>The source of the log file, most often in the form of the client's hostname.</item>
+<item>The time at which the log file arrived on the server.</item>
+<item>The service on the client which produced the log file.</item>
+</itemize>
+
+Sometimes, this information is available from the message itself, as in an email header.
+On other occasions, the information needs to be supplied externally,
+e.g. by using command line options.
+</para>
+<para>
+Apart from determining information about the client's message, the content
+of the message needs to be analyzed in order to handle it properly.
+The body of the message may contain all sorts of information, such as:
+<itemize>
+<item>System log file</item>
+<item>Apache log file</item>
+<item>Report from a Gnucomo agent</item>
+<item>Something else...</item>
+</itemize>
+
+The message is analyzed to obtain information about what the message entails
+and where it came from.
+The <strong>classify()</strong> method tries to extract that information.
+Sometimes, this information cannot be determined with 100% certainty.
+The certainty expresses how sure we are about the contents in the message.
+Classifying a message may be performed with an algorithm as shown in
+the following pseudo code:
+
+<verbatim>
+while certainty &lt; ε AND not at end
+
+ Scan for a marker
+
+ Adjust certainty
+</verbatim>
+
+Initially, a message is not classified and the certainty is 0.0.
+Some lines point toward a certain class of message but do not absolutely determine
+the class of a message. Other pieces of text are typical for a certain message class.
+Examples of markers that determine the classification of a client message
+are discussed below.
+
+<verbatim>
+From - Sat Sep 14 15:01:15 2002
+</verbatim>
+
+This is almost certainly a UNIX style mail header.
+There should be lines beginning with <code>From:</code> and <code>Date:</code>
+before the first empty line is encountered.
+The hostname of the client that sent the message and the time of arrival
+can be determined from these email header lines.
+The content of the message is still to be determined by matching
+other markers.
+
+<verbatim>
+-----BEGIN PGP MESSAGE-----
+</verbatim>
+
+Such a line in the message certainly means that the message is PGP or GnuPG
+encrypted.
+Decrypting is possible only if someone or something provides a secret key.
+
+<verbatim>
+Sep 1 04:20:00 kithira kernel: solo1: unloading
+</verbatim>
+
+The general pattern of a system log line is an abbreviated month name, a day,
+a time, a name of a host without the domain, the name of a service followed
+by a colon and finally, the message of that service.
+We can match this with a regular expression to see if the message holds syslog lines.
+Similar matches can be used to find Apache log lines or output from the <emph>dump</emph>
+backup program or anything else.
+</para>
+
+<para>
+The message classification embodies the way in which a message must be
+handled and in what way information from the message can be put into
+the database.
+Aspects for handling the message are for example:
+<itemize>
+<item>Strip lines at the beginning or end.</item>
+<item>Store each line separately or store the message as a whole.</item>
+<item>How to extract hostname, arrival time and service from the message.</item>
+<item>How to break up the message into individual fields for a <emph>log</emph> record.</item>
+</itemize>
+</para>
+
+<para>
+The figure below shows the class diagram that is used for <strong>gcm_input</strong>:
+ <para>
+ <picture src='classes-gcm_input.png' eps='classes-gcm_input'/>
+ </para>
+
+The heart of the application is a <emph>client_message</emph> object.
+This object reads the message through the <emph>message_buffer</emph> from some
+input stream (file, string, stdin or socket), classifies the message and
+enters information from the message into the database.
+It has a relationship with a <emph>gnucomo_database</emph> object which
+is an abstraction of the tables in the database.
+These are the methods for the <emph>client_message</emph> class:
+
+<itemize>
+<item>client_message::client_message(istream *in, gnucomo_database *db)
+ <para>
+ Constructor.
+ </para>
+</item>
+<item>double client_message::classify(String host, date arrival_d, hour arrival_t, String serv)
+ <para>
+ Try to classify the message and return the certainty with which the class of the
+ message could be determined.
+ If the hostname, arrival time and service cannot be extracted from the message,
+ the arguments are used as defaults.
+ </para>
+</item>
+<item>int client_message::enter()
+ <para>
+ Insert the message contents into the <emph>log</emph> table of the gnucomo
+ database.
+ Returns the number of records inserted.
+ </para>
+</item>
+</itemize>
+
+</para>
+<para>
+Some kind of input buffering is needed when a client message is being processed.
+The contents of the message are not entirely clear until a few lines are analyzed,
+and these lines probably need to be read again.
+When the message is stored in a file, this is no problem; a simple lseek allows us
+to read any line over and over again.
+However, when the message comes from an input stream, like a TCP socket or just
+plain old stdin, this is a different matter.
+Lines of the message that have already been read will be lost forever, unless they are
+stored somewhere.
+To store an input stream temporarily, there are two options:
+<enumerate>
+<item>In an internal memory buffer.</item>
+<item>In an external (temporary) file.</item>
+</enumerate>
+The <emph>message_buffer</emph> class takes care of the input buffering, thus
+hiding these implementation details.
+On the outside, a <emph>message_buffer</emph> can be read line by line until the
+end of the input is reached.
+Lines of input can be read again by backing up to the beginning of the message
+by using the <strong>rewind()</strong> method or by backing up one line
+with the <strong>--</strong> operator.
+The <emph>message_buffer</emph> object maintains a pointer to the next
+available line.
+The <strong>++</strong> operator, being the opposite of the <strong>--</strong>
+operator, skips one line.
+</para>
+
+<para>
+The <strong>>></strong> operator reads data from the message
+into the second (String) operand, just like the <strong>>></strong>
+operator for an istream.
+There is a small difference, though.
+The <strong>>></strong> operator for a <emph>message_buffer</emph>
+returns a boolean value which states if there actually was input available.
+This value will usually turn to <code>false</code> at the end of file.
+A second difference is the fact that input data can only be read into
+<emph>String</emph> objects a line at a time.
+There are no functions for reading integer or floating point numbers.
+The <strong>>></strong> operator reads the next line either from
+an internal buffer or from the external input stream if the internal
+buffer is exhausted.
+Lines read from the input stream are cached in the internal buffer,
+so they are available for reading another time, e.g. after
+rewinding to the beginning of the message.
+</para>
+
+<para>
+Methods for the <emph>message_buffer</emph> class:
+
+<itemize>
+<item>message_buffer::message_buffer(istream *in)
+ <para>
+ Constructor.
+ </para>
+</item>
+<item>bool operator >>(message_buffer &amp;, String &amp;)
+</item>
+<item>message_buffer::rewind()</item>
+<item>message_buffer::operator --</item>
+<item>message_buffer::operator ++</item>
+</itemize>
+</para>
+<subsection>
+<heading>Command arguments</heading>
+
+<para>
+<strong>Gcm_input</strong> understands the following command line arguments:
+<itemize>
+<item>-c &lt;name&gt; : Configuration name</item>
+<item>-d &lt;date&gt; : Arrival time of the message</item>
+<item>-h &lt;hostname&gt; : FQDN of the client</item>
+<item>-s &lt;service&gt; : service that created the log</item>
+<item>-v : verbose output. Print lots of debug information</item>
+<item>-V : Print version and exit</item>
+</itemize>
+</para>
+</subsection>
+</section>
+
<section>
<heading>Design ideas</heading>
<para>
-Use of a neural network to analyse system logs:
+Use of a neural network to analyze system logs:
<itemize>
<item>Classify words</item>
<item>Classify message based on word classification</item>