Commit a36f7531 authored by Jean-Benoist Leger's avatar Jean-Benoist Leger
Browse files

initial commit

parents
Author :
Jean-Benoist Leger <jleger@agroparistech.fr>, INRA, France
Contributor:
Pierre Barbillon <barbillon@agroparistech.fr>, AgroParisTech, France
This diff is collapsed.
Dependencies
************
* GNU make
* GCC - GNU Compilers Collection including C++ (g++) compiler
* Boost Serialization Library version >=1.49
* Boost Iostreams Library version >=1.49
* Boost Thread Library version >=1.49
* Boost Timer Library version >=1.49
* Boost System Library version >=1.49
* IT++ (itpp) Library
For Debian and Debian based distribution, install the following packages to
provide dependencies :
make g++ libboost-iostreams-dev libboost-serialization-dev libboost-thread-dev
libboost-timer-dev libboost-system-dev libitpp-dev
Build and install
*****************
You can choose the installation PREFIX (default: /usr/local). The wmixnet program is
installed in PREFIX/bin/, the manual in PREFIX/share/man/man1, and the scripts to create spm
files in PREFIX/share/doc/wmixnet/
Two examples are given.
For a system-wide installation, use the following steps :
* ./configure
* make
* (as root) make install
For an installation in the user's home directory, use the following steps :
* ./configure --prefix=$HOME/local
* make
* make install
# Top-level automake input: selects the "foreign" automake option and lists the
# subdirectories that carry their own Makefile.
AUTOMAKE_OPTIONS = foreign
SUBDIRS = src man tools
Introduction
************
wmixnet is an implementation of Variational EM estimation on Stochastic Block
Model graphs for various probability laws.
Copyright
*********
wmixnet is written by Jean-Benoist Leger, INRA (France), and published under
GNU GPL v3 licence, see COPYING.
Build and install
*****************
See INSTALL.
#!/bin/sh
# Bootstrap the build system: stamp the version into the files that embed the
# placeholder, generate the man page, then run the autotools.
# All paths are relative, so this must be run from the top-level directory.

# Stop at the first failing command so that a broken step (missing VERSION
# file, failing aclocal, ...) does not silently leave a half-generated tree.
set -e

# Read the version once and refuse to stamp an empty string.
version=$(cat VERSION)
if [ -z "$version" ]; then
    echo "autogen.sh: VERSION is empty" >&2
    exit 1
fi

# Substitute the version placeholder in place (no-op if already substituted).
sed -r "s/xxxVERSIONxxx/${version}/g" -i man/genman.sh
sed -r "s/xxxVERSIONxxx/${version}/g" -i configure.ac

# Generate the man page, then the configure script and Makefile.in files.
sh man/genman.sh
aclocal
autoconf
automake --copy --add-missing
#rm -r autom4te.cache/
#rm aclocal.m4
# -*- Autoconf -*-
# Process this file with autoconf to produce a configure script.
# The version string in the package initialisation below is a placeholder that
# autogen.sh substitutes with the content of the VERSION file.

AC_PREREQ([2.68])
AC_INIT([wmixnet], [xxxVERSIONxxx], [jleger@agroparistech.fr])
# Package name and version come from the initialisation above; the
# two-argument form of the automake initialisation is deprecated.
AM_INIT_AUTOMAKE
AC_CONFIG_SRCDIR([src/wmixnet.cc])
AC_CONFIG_HEADERS([src/config.h])

# Checks for programs.
AC_PROG_CXX
AC_PROG_CC
# All following checks are run with the C++ compiler.
AC_LANG_PUSH([C++])

# Checks for libraries.
# The empty action-if-found keeps the default behaviour (library added to LIBS).
AC_CHECK_LIB([boost_iostreams], [main], [], [AC_MSG_ERROR([You need the boost iostreams library.])])
AC_CHECK_LIB([boost_serialization], [main], [], [AC_MSG_ERROR([You need the boost serialization library.])])
AC_CHECK_LIB([boost_thread], [main], [], [AC_MSG_ERROR([You need the boost thread library.])])
AC_CHECK_LIB([boost_timer], [main], [], [AC_MSG_ERROR([You need the boost timer library.])])
AC_CHECK_LIB([boost_system], [main], [], [AC_MSG_ERROR([You need the boost system library.])])
AC_CHECK_LIB([itpp], [main], [], [AC_MSG_ERROR([You need the itpp library.])])

# Checks for header files (each one is mandatory; stdlib.h is checked once).
AC_CHECK_HEADERS([boost/archive/binary_iarchive.hpp], [], [AC_MSG_ERROR([File needed])])
AC_CHECK_HEADERS([boost/archive/binary_oarchive.hpp], [], [AC_MSG_ERROR([File needed])])
AC_CHECK_HEADERS([boost/iostreams/filter/gzip.hpp], [], [AC_MSG_ERROR([File needed])])
AC_CHECK_HEADERS([boost/iostreams/filtering_stream.hpp], [], [AC_MSG_ERROR([File needed])])
AC_CHECK_HEADERS([boost/thread.hpp], [], [AC_MSG_ERROR([File needed])])
AC_CHECK_HEADERS([boost/timer/timer.hpp], [], [AC_MSG_ERROR([File needed])])
AC_CHECK_HEADERS([fstream], [], [AC_MSG_ERROR([File needed])])
AC_CHECK_HEADERS([getopt.h], [], [AC_MSG_ERROR([File needed])])
AC_CHECK_HEADERS([iomanip], [], [AC_MSG_ERROR([File needed])])
AC_CHECK_HEADERS([iostream], [], [AC_MSG_ERROR([File needed])])
AC_CHECK_HEADERS([itpp/itbase.h], [], [AC_MSG_ERROR([File needed])])
AC_CHECK_HEADERS([itpp/itstat.h], [], [AC_MSG_ERROR([File needed])])
AC_CHECK_HEADERS([stdio.h], [], [AC_MSG_ERROR([File needed])])
AC_CHECK_HEADERS([stdlib.h], [], [AC_MSG_ERROR([File needed])])
AC_CHECK_HEADERS([time.h], [], [AC_MSG_ERROR([File needed])])

# Checks for typedefs, structures, and compiler characteristics.
AC_HEADER_STDBOOL
AC_C_INLINE

# Checks for library functions.
AC_CHECK_FUNCS([pow sqrt])

# Files to generate; passing the list to the output macro itself is obsolete.
AC_CONFIG_FILES([Makefile
                 src/Makefile
                 man/Makefile
                 tools/Makefile])
AC_OUTPUT
# Man page to install; wmixnet.1 is the file written by man/genman.sh.
man_MANS = wmixnet.1
#!/bin/sh
# Render the POD manual man/wmixnet.pod into the man page man/wmixnet.1.
# Paths are relative to the top-level directory. The version marker in the
# --release string is substituted by autogen.sh (via sed) before this runs.
pod2man --center "" --release "wmixnet-xxxVERSIONxxx" man/wmixnet.pod man/wmixnet.1
=head1 NAME
wmixnet - Variational E-M estimation on Stochastic Block Model
=head1 SYNOPSIS
wmixnet -m <modelname> [-i <input_filename> [-N <nb_of_nodes>]] [-f <state_filename>] [-s] [-Q <maximal_number_of_groups> | -a [-e <value>]] [-S <smoothing_mode>] [-o <output_filename>] [-F <output_format>] [-P | -p] [-n <number_of_threads>] [--tolP <value>] [--tolF <value>] [--tolEM <value>] [--ortho] [--penalty <value>] [--version]
=head1 DESCRIPTION
wmixnet fits the model given as argument to the data.
=head1 NOTATIONS
B<n> Number of nodes
B<Q> Number of groups
B<theta> Parameters of the model
B<X> Weighted adjacency matrix
B<Y> Matrix of vectors of covariables
=head1 OPTIONS
=head2 -m <modelname>, --model-name <modelname>
Use model modelname.
modelname can be :
=head3 bernoulli
X_{ij}|(Z_{iq}=1&&Z_{jl}=1) ~ B(pi_{ql})
=head3 BH
X_{ij}|(Z_{iq}=1&&Z_{jl}=1) ~ B(pi_{ql}/(1+exp(-beta*Y_{ij})))
=head3 BI
X_{ij}|(Z_{iq}=1&&Z_{jl}=1) ~ B(pi_{ql}/(1+exp(-beta_{ql}*Y_{ij})))
=head3 poisson
X_{ij}|(Z_{iq}=1&&Z_{jl}=1) ~ Poisson(lambda_{ql})
=head3 gaussian
X_{ij}|(Z_{iq}=1&&Z_{jl}=1) ~ Gaussian(mu_{ql}, sigma2)
=head3 PRMH
X_{ij}|(Z_{iq}=1&&Z_{jl}=1) ~ Poisson(lambda_{ql}exp(beta*Y_{ij}))
=head3 PRMI
X_{ij}|(Z_{iq}=1&&Z_{jl}=1) ~ Poisson(lambda_{ql}exp(beta_{ql}*Y_{ij}))
=head3 GRMH
X_{ij}|(Z_{iq}=1&&Z_{jl}=1) ~ Gaussian(mu_{ql} + beta*Y_{i,j}, sigma2)
=head3 GRMI
X_{ij}|(Z_{iq}=1&&Z_{jl}=1) ~ Gaussian(mu_{ql} + beta_{ql}*Y_{i,j}, sigma2)
=head2 -s, --symmetric
Load symmetric matrices X and Y
=head2 -i <input_filename>, --input-file <input_filename>
Specify the input file. This argument is ignored if the state file exists
and the program loads previously computed results.
Each row describes an edge. The first two columns are node ids. Node ids
must be numeric and >0. The third column contains the weight of the edge. The next
columns contain covariables. Each edge must have the same number of
covariables.
Any edge not specified by a row is set to zero, without covariables.
If covariables are used, each edge must be specified.
GNU Octave/Matlab and GNU R functions are provided to write spm files from
matrices. See TOOLS.
=head2 -N <nb_of_nodes>, --number-of-nodes <nb_of_nodes>
Specify the number of nodes. If this option is not provided, the number of
nodes is automatically detected from the input file. This option is useful
only for very large graphs, to pre-allocate memory during the input file
loading stage.
=head2 -f <state_filename>, --state-file <state_filename>
Specify the state file. This file is used to restart the program without recomputing
previously computed results. By default its value is input_name.state
=head2 -o <output_filename>, --output <output_filename>
Specify the output filename. See --output-format for output format.
If output_filename is not specified, output file is not created
=head2 -F <output_format>, --output-format <output_format>
Specify the output-format.
=head3 Output format description
Available output formats are :
=head4 text
This is the default output format.
Human-readable file.
=head4 octave, matlab
GNU Octave and Matlab script file.
This script file creates a struct which contains values, and cells indexed by
the "number of groups".
=head4 R
GNU R script file
This script file creates a list which contains values, and lists indexed by
the "number of groups".
=head3 fields description
The output contains the following fields
=head4 Qmax
See --Qmax
=head4 model
See --model-name
=head4 tolP
See --tolP
=head4 tolF
See --tolF
=head4 tolEM
See --tolEM
=head4 J
Values of pseudo-likelihood of model + entropy of variational approximation
for each Q
=head4 ICL
Values of ICL criterion for each Q
=head4 alphas
Vector of classes proportion for each Q
=head4 thetas
Set of parameters (named theta) of model for each Q
For bernoulli, theta contains the matrix of pi_{ql}
For BH, theta contains the matrix of pi_{ql} and the matrix beta
For BI, theta contains the matrix of pi_{ql}, the matrix of first components
of beta ... the matrix of last component of beta
For poisson, theta contains the matrix of lambda_{ql}
For PRMH, theta contains the matrix of lambda_{ql} and the matrix beta
For PRMI, theta contains the matrix of lambda_{ql}, the matrix of first
components of beta ... the matrix of last component of beta
For gaussian, theta contains the matrix of mu_{ql} and sigma2
For GRMH, theta contains the matrix of mu_{ql}, the matrix beta
and sigma2
For GRMI, theta contains the matrix of mu_{ql}, the matrix of first
components of beta ... the matrix of last component of beta and sigma2
=head2 -Q <integer>, --Qmax <integer>
Specify the maximal number of classes. This is set to the minimal number of classes
if only the minimal number of classes is specified. This argument has no effect
in automatic search mode (-a).
=head2 -a, --Qauto, --Qautomatic
Automatically find the number of class by increasing Qmax, starting from
Qmax=4
=head2 -e, --exploration
In automatic mode (-a), Qmax is increased to exploration * Qoptimal. By
default exploration = 1.5
=head2 -S <smoothing_mode>, --smoothing_mode <smoothing_mode>
Select the smoothing mode. Restart the E-M algorithm with a starting point
obtained by splitting or merging classes.
=head3 none
No smoothing is applied (default)
=head3 minimal
Smoothing is applied only on Q where J is not increasing function of Q, or
where ICL is not a convex function of Q. This mode is a good compromise.
=head3 exhaustive
Smoothing is applied for all Q.
=head2 --tolP <value>
This is tolP value for ceiling some parameters. By default this is set to
1e-4
=head2 --tolF <value>
This is value for stopping rule of pseudo E-step, where expectation is done
by a fixed-point algorithm. By default this is set to 1e-1.
=head2 --tolEM <value>
This is value for stopping rule of E-M algorithm. By default this value is
set to 1e-5.
=head2 --penalty <value>
This option is useful for the PRMH model. The maximization is done on the
log-likelihood penalized by penalty * norm-2^2 of beta (the influence
parameter of covariates). See --ortho.
=head2 --ortho
This option orthonormalizes covariates before running the estimation. Estimates
must be interpreted through the transformation (provided in output).
Useful when a penalty is applied.
=head2 -p, --parrallel
Enable parallel mode. E-M runs are done in parallel threads. The parallel mode
is enabled by default, this may change in the future.
=head2 -P, --no-parrallel
Disable parallel mode. See --parrallel.
=head2 -n <integer>, --threads <integer>
Set the number of threads to be executed in parallel when parallel mode is
enabled. By default this value is set to the number of computation units of
the system. Most users do not need to change this value.
=head2 -V, --version
Print version of wmixnet on stderr.
=head1 TOOLS
There are tools to generate spm files; these files are located in the
documentation directory, PREFIX/share/doc/wmixnet, where PREFIX is your
installation prefix.
Unlike the wmixnet program, these files are provided under a permissive
open-source licence, the BSD-2 licence.
=head2 GNU Octave / Matlab
There is a function create_spm(). The file is named create_spm.m
=head2 GNU R
There is a function create_spm(). The file is named create_spm.R
=head1 BUGS
=head2 Number of covariables
If the edges do not all have the same number of covariables, the program calls abort()
without saying why.
=head1 AUTHORS
Jean-Benoist Leger <jleger@agroparistech.fr>
=head1 BIBLIOGRAPHY
Mariadassou, M. and Robin, S. and Vacher, C., Uncovering latent structure in
valued graphs: a variational approach, The Annals of Applied Statistics,
Volume 4, Number 2 (2010), 715-742.
# Warning flags for the C++ compiler. They used to be set through CFLAGS
# together with -std=c99; CFLAGS is reserved for the user and is only used for
# C compilation, so they never reached the C++ sources listed below.
# The optimisation level is left to the user's CXXFLAGS.
AM_CXXFLAGS = -pedantic -Wall
# Libraries go in LDADD so that they are placed after the objects on the link
# line (AM_LDFLAGS is emitted before them and is meant for linker options).
LDADD = -lboost_thread -lboost_serialization -lboost_iostreams -litpp -lboost_timer -lboost_system
# this lists the binaries to produce, the (non-PHONY, binary) targets in
# the previous manual Makefile
bin_PROGRAMS = wmixnet wmixnet_export_init wmixnet_estim_with_init
# Sources shared by the three programs; each program only adds the file that
# holds its own entry point.
common_sources = automatic.cc em.cc e_step.cc f_bernoulli.cc f_BI.cc f_BH.cc f_gaussian.cc f_GRMH.cc f_GRMI.cc f_poisson.cc f_PRMH.cc f_PRMI.cc itpp_serialize.h main.h petites_surcharges.cc petites_surcharges.h save.cc smoothing.cc state.cc symetriser.cc tools.cc wmixnet.cc wmixnet.h spectral_clustering.cc do_first.cc construire_init.cc ortho.cc config.h config_undef.h
wmixnet_SOURCES = $(common_sources) main.cc
wmixnet_export_init_SOURCES = $(common_sources) export_init.cc
wmixnet_estim_with_init_SOURCES = $(common_sources) estim_with_init.cc
// Copyright (C) 2012, Jean-Benoist Leger
// <jleger@agroparistech.fr>, INRA, France
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "wmixnet.h"
// Automatic search of the number of groups.
//
// Each pass runs em_with_init() and smoothing() on the current range
// [Qmin, Qmax], takes Qopt as the position of the largest entry of v_ICL
// shifted by Qmin, and derives a target upper bound exploration*Qopt+.5
// (truncated to int, capped at m_X.rows()). The loop stops as soon as Qmax
// already reaches that target; otherwise the range is widened to [1, target]
// and the state is saved before the next pass.
void wmixnet::automatic()
{
    if(smoothing_mode=="none")
    {
        std::cerr << "You running automatic mode without smoothing, you must carrefully analalyze results" << std::endl;
    }

    // Start from the range [1,4] unless a usable range is already set.
    if(Qmax<4 || Qmin<=0)
        set_QminQmax(1,4);

    while(true)
    {
        std::cerr << "Automatic procedure for " << Qmin << " <= Q <= " << Qmax << std::endl;
        em_with_init();
        smoothing();

        // v_ICL is indexed from Qmin, hence the shift.
        const int Qopt = itpp::max_index(v_ICL)+Qmin;
        std::cerr << "Current maximum is for Q = " << Qopt << std::endl;

        // Target upper bound, never larger than the number of rows of m_X.
        const double target = exploration*Qopt+.5;
        const int Qmaxopti = (target<m_X.rows())
            ? static_cast<int>(target)
            : static_cast<int>(m_X.rows());

        // The explored range already covers the target: done.
        if(Qmax>=Qmaxopti)
            break;

        set_QminQmax(1,Qmaxopti);
        save_state();
    }
}
/* src/config.h.in. Generated from configure.ac by autoheader. */
/* NOTE: this template is generated; regenerate it with autoheader rather than
   editing it by hand. Each #undef below is a placeholder, presumably rewritten
   by the configure script when it produces src/config.h (to be confirmed). */
/* Define to 1 if you have the <boost/archive/binary_iarchive.hpp> header
file. */
#undef HAVE_BOOST_ARCHIVE_BINARY_IARCHIVE_HPP
/* Define to 1 if you have the <boost/archive/binary_oarchive.hpp> header
file. */
#undef HAVE_BOOST_ARCHIVE_BINARY_OARCHIVE_HPP
/* Define to 1 if you have the <boost/iostreams/filtering_stream.hpp> header
file. */
#undef HAVE_BOOST_IOSTREAMS_FILTERING_STREAM_HPP
/* Define to 1 if you have the <boost/iostreams/filter/gzip.hpp> header file.
*/
#undef HAVE_BOOST_IOSTREAMS_FILTER_GZIP_HPP
/* Define to 1 if you have the <boost/thread.hpp> header file. */
#undef HAVE_BOOST_THREAD_HPP
/* Define to 1 if you have the <boost/timer/timer.hpp> header file. */
#undef HAVE_BOOST_TIMER_TIMER_HPP
/* Define to 1 if you have the <fstream> header file. */
#undef HAVE_FSTREAM
/* Define to 1 if you have the <getopt.h> header file. */
#undef HAVE_GETOPT_H
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
/* Define to 1 if you have the <iomanip> header file. */
#undef HAVE_IOMANIP
/* Define to 1 if you have the <iostream> header file. */
#undef HAVE_IOSTREAM
/* Define to 1 if you have the <itpp/itbase.h> header file. */
#undef HAVE_ITPP_ITBASE_H
/* Define to 1 if you have the <itpp/itstat.h> header file. */
#undef HAVE_ITPP_ITSTAT_H
/* Define to 1 if you have the `boost_iostreams' library (-lboost_iostreams).
*/
#undef HAVE_LIBBOOST_IOSTREAMS
/* Define to 1 if you have the `boost_serialization' library
(-lboost_serialization). */
#undef HAVE_LIBBOOST_SERIALIZATION
/* Define to 1 if you have the `boost_system' library (-lboost_system). */
#undef HAVE_LIBBOOST_SYSTEM
/* Define to 1 if you have the `boost_thread' library (-lboost_thread). */
#undef HAVE_LIBBOOST_THREAD
/* Define to 1 if you have the `boost_timer' library (-lboost_timer). */
#undef HAVE_LIBBOOST_TIMER
/* Define to 1 if you have the `itpp' library (-litpp). */
#undef HAVE_LIBITPP
/* Define to 1 if you have the <memory.h> header file. */
#undef HAVE_MEMORY_H
/* Define to 1 if you have the `pow' function. */
#undef HAVE_POW
/* Define to 1 if you have the `sqrt' function. */
#undef HAVE_SQRT
/* Define to 1 if stdbool.h conforms to C99. */
#undef HAVE_STDBOOL_H
/* Define to 1 if you have the <stdint.h> header file. */
#undef HAVE_STDINT_H
/* Define to 1 if you have the <stdio.h> header file. */
#undef HAVE_STDIO_H
/* Define to 1 if you have the <stdlib.h> header file. */
#undef HAVE_STDLIB_H
/* Define to 1 if you have the <strings.h> header file. */
#undef HAVE_STRINGS_H
/* Define to 1 if you have the <string.h> header file. */
#undef HAVE_STRING_H
/* Define to 1 if you have the <sys/stat.h> header file. */
#undef HAVE_SYS_STAT_H
/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H
/* Define to 1 if you have the <time.h> header file. */
#undef HAVE_TIME_H
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
/* Define to 1 if the system has the type `_Bool'. */
#undef HAVE__BOOL
/* Name of package */
#undef PACKAGE
/* Define to the address where bug reports for this package should be sent. */
#undef PACKAGE_BUGREPORT
/* Define to the full name of this package. */
#undef PACKAGE_NAME
/* Define to the full name and version of this package. */
#undef PACKAGE_STRING
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
/* Define to the home page for this package. */
#undef PACKAGE_URL
/* Define to the version of this package. */
#undef PACKAGE_VERSION
/* Define to 1 if you have the ANSI C header files. */