\documentclass[10pt]{article}
\usepackage{times}
\usepackage{epsfig}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{setspace}
\doublespacing
\setlength{\textheight}{8.875in}
\setlength{\textwidth}{6.875in}
\setlength{\columnsep}{0.3125in}
\setlength{\topmargin}{0in}
\setlength{\headheight}{0in}
\setlength{\headsep}{0in}
\setlength{\parindent}{1pc}
\setlength{\oddsidemargin}{-.1875in}  % Centers text.
\setlength{\evensidemargin}{-.1875in}

\newcommand{\Section}[1]{\vspace{-8pt}\section{\hskip -1em.~~#1}\vspace{-3pt}}
\newcommand{\SubSection}[1]{\vspace{-3pt}\subsection{\hskip -1em.~~#1}
        \vspace{-3pt}}
\newtheorem{thm}{Theorem}
\newtheorem{lem}{Lemma}
\newtheorem{prop}{Proposition}
\newtheorem{coro}{Corollary}
\newcommand{\bprf}{\par{}\smallskip\noindent{\bf Proof}:\ }
\newcommand{\eprf}{\hfill\rule{.2cm}{.2cm}}
\newcommand{\Z}{\mathbb{Z}}
\begin{document}
% Don't want date printed
\date{}
% Make title bold and 14 apt font (Latex default is non-bold, 16apt)
\title{\bf The Symbolic Goulden Jackson Cluster Method}
% For single author (just remove % characters)
% For two authors (default example)
\author{
%\begin{tabular}[t]{c@{\extracolsep{8em}}c}
Xiangdong Wen
%\end{tabular}
}
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\Section{Abstract} \label{sec-intro}
Let $V$ and $B$ be a finite alphabet and a finite set of $words$ (on $V$)
respectively. 
Suppose  $a(n)$ is the total number of words with length $n$ that avoid 
the words in $B$ as factors. The aim is to
find the generating function 
\begin{equation}
f(s)=\sum_{n=0}^{\infty}a(n)s^n
\label{gene}
\end{equation} 
in an efficient way.
\bigskip \\
The Goulden-Jackson cluster method(\cite{GoJ1},\cite{GoJ2}), which is widely used in solving
this kind of problems, has
been beautifully explained,
extended, and implemented by J. Noonan and 
D. Zeilberger(\cite{NZ}). 
However, their Maple packages require that
the cardinality of the alphabet is a {\it numeric} argument
rather than 
{\it symbolic}. In this paper we extended the method into the latter
case, thereby initiating the
{\it Symbolic Goulden-Jackson Method}.
\bigskip\\
{\bf Keywords:}\\
Goulden-Jackson cluster method, marked word, cluster, self-avoiding walks.
\Section{Review of the Goulden Jackson Cluster method}\label{sec-gj}
Recall that a {\bf factor} of the word $w_1w_2\cdots w_n$ is one of the words
$w_iw_{i+1}\cdots w_{j-1} w_j$, $1\leq i \leq j \leq n$ that we shall
denote by $[i,j]$. 
Two factors $[i,j]$ and $[i',j']$ {\bf overlap} if they have at least 
one common letter.
\bigskip \\
The {\bf length} of the word $w=w_1w_2\cdots w_n$ is $|w|=n;$
and the {\bf weight}
of the word $w$ is $weight(w)=s^{|w|}=s^n$. Obviously, 
the generating function ~(\ref{gene}) is the same as
$$ f(s)=\sum_{w \in {\cal L}(B) }weight(w),$$ 
where ${\cal L}(B)$ is the set of all words over $V$ that avoid the members
of $B$ as factors. 
\bigskip \\
A word with some factors marked is called a {\bf marked word}. 
Here we only consider the case when the marked
factors are the words in $B$. A 
marked word could be written in the following form: 
$$
(w;[i_1,j_1],[i_2,j_2],\cdots,[i_l,j_l]), \mbox{ where }
[i_r,j_r]\, , 1\leq r\leq l \mbox{ are marked factors.}
$$
For example, let $ V=\{1,2,3\} $, $ B=\{123,231,312\} $ and $w=12312.$ 
There are totally $2^3$ marked words for $w$:
\begin{equation}\nonumber
\begin{array}{llll} 
(12312;),& (12312;[1,3]),& (12312;[2,4]),& (12312;[3,5]),\\ 
(12312;[1,3],[2,4]),& (12312;[1,3],[3,5]),& 
(12312,[2,4],[3,5]), & (12312;[1,3],[2,4],[3,5]).\\
\end{array}
\end{equation}
\bigskip \\
Define the ${\bf \overline{weight}}$ of a marked word 
$w$ with marked factors $S$, $S \subset B$ as 
$$
\overline{weight}(w,S)=(-1)^{|S|} s^{|w|},
$$
where $|S|$ is the cardinality of $S$.
\bigskip \\
Let $V^*$ be the set of all words generated by $V$; and let $B(w)$
be a subset of $B$ whose members are also factors of $w$. We have
\begin{thm}: 
\begin{equation}f(s)=\sum_{w \in {\cal L}(B) } weight(w)=\sum_{w\in V^*} 
\sum_{S \subset B(w)} \overline{weight}(w,S).
\label{mark}
\end{equation}
\end{thm}
\bprf
The basic idea in the proof is to use the inclusion-exclusion principle.
Let $N_B(w)$ denote the number of marked factors of $w$ that belong to $B$. 
Then,
\begin{displaymath}
\begin{array}{rcl}
f(s) &=& \sum_{w \in {\cal L}(B)} weight(w)\\
&=&\sum_{w \in V^*} weight(w)0^{N_B(w)}\\
&=&\sum_{w \in V^*} s^{|w|}[1+(-1)]^{N_B(w)}\\
&=&\sum_{w \in V^*} s^{|w|} \sum_{S \subset B(w)} (-1)^{|S|} \\
&=&\sum_{w \in V^*} \sum_{S \subset B(w)} (-1)^{|S|}s^{|w|}\\
&=&\sum_{w \in V^*} \sum_{S \subset B(w)}\overline{weight}(w,S)
\end{array}
\end{displaymath}
\eprf
\bigskip \\
By the theorem, the calculation of the generating function ~(\ref{gene}) 
is then  
transfered to finding the generating function for the weighted  
marked words ~(\ref{mark}) which is much easier to weight-count 
by the Goulden-Jackson cluster method. 
\bigskip \\
A {\bf cluster } is a marked word 
$$(w_1w_2\cdots w_n;[i_1(=1),j_1],[i_2,j_2],\cdots,[i_l,j_l(=n)]),$$
where $[i_k,j_k]$ overlaps with $[i_{k+1},j_{k+1}]$ for all $k=1,2,\cdots,l-1$. 
\bigskip \\
A non-empty marked word either ends with a letter that is not
part of a cluster, or ends with a cluster. Peeling-off the maximal cluster, we
get a shorter marked word. Thus we have the following decomposition
$${\cal M}=\{empty\_word\} \cup {\cal M}V \cup {\cal M}{\cal C}.$$
Taking weights on both sides and solving for $\overline{weight}({\cal M})$, we have
\begin{equation}
f(s)= \overline{weight}({\cal M}) 
=\frac{1}{1-ds-\overline{weight}({\cal C})}\quad .
\label{main}
\end{equation}
\bigskip \\
The only step left is to find $\overline{weight}({\cal C})$.
\bigskip \\
For a given word $w=w_1w_2\cdots w_n$, let 
$HEAD(w)$ be the set of all proper prefixes:
 $$HEAD(w)\, := \,\{w_1w_2\cdots w_k|k=1,2,\cdots,n-1\},$$
and $TAIL(w)$ be the set of all proper suffixes 
$$
TAIL(w)\, := \, \{w_kw_{k+1}\cdots w_n|k=2,3,\cdots,n\},
$$  
and let
$$
OVERLAP(u,v)\,:= \,TAIL(u)\cap HEAD(v).
$$ 
Let $u/v$ denote the operation of the
word $u$  chopping off its head $v$. For example: $12321/12=321.$ 
Let
$$
(u:v)\, := \, \sum_{x\in OVERLAP(u,v)}\overline{weight}(v/x).
$$
\bigskip \\
The set of clusters ${\cal C}$ can be partitioned into
$$
{\cal C}= \bigcup_{v \in B} {\cal C}[v] \, ,$$
 where ${\cal C}[v],$  $ v \in B$ is the set
of clusters whose last entry is $v$.
\bigskip \\
Given a cluster in ${\cal C}[v]$, $ v \in B$ it either consists of 
just $v$  or
chopping $v$ results in a shorter cluster in ${\cal C}[u]$, $ u \in B$ if
$OVERLAP(u,v)$ is not empty. On the other hand, given a cluster in ${\cal C }[u]$,
we could always reconstitute the bigger cluster in ${\cal C}[v]$ 
by adding some words in $OVERLAP(u,v).$  
Hence, there exists a bijection:
$$ 
{\cal C}[v] \leftrightarrow \{(v;[1,|v|])\}\bigcup_{u\in B} {\cal C}[u] OVERLAP(u,v).
$$
Taking weights on both sides, we have:
\begin{equation}
\overline{weight}({\cal C}[v])=(-1)\,\overline{weight}(v)-\sum_{u \in
  B}\,(u:v)\,\overline{weight}({\cal C}[u]).
\label{mainsys}
\end{equation}
This is a system of $|B|$ linear equations with $|B|$ unknowns 
$\overline{weight}({\cal C}[v]), v\in B.$
\bigskip \\ 
After solving these equations we could simply obtain 
$\overline{weight}({\cal C})$ by:
$\overline{weight}({\cal C})=\sum_{v\in B}\,\overline{weight}({\cal C}[v])$ 
\bigskip \\
Because $\overline{weight}({\cal C})$ is independent of
the cardinality of the alphabet, 
the symbolic Goulden Jackson could be easily implemented. 

\section{Symmetric Cases} 
Given an alphabet $V=\{1,2,3\}$, let us find the generating 
function for the number of words which do not 
have three consecutive different letters as factors, 
i.e. $B=\{123,132,213,231,321\}$.
\bigskip \\
By the original Goulden-Jackson cluster method, 
we need to set up and solve a system of $|B|=6$ 
linear equations with six unknowns $\overline{weight}({\cal C}[v])$, 
$v\in B$ : 
\begin{equation}\nonumber
 \left\{
\begin{array}{lcl} 
\overline{weight}({\cal C}[123]) & = & -s^3-s^2\,\overline{weight}({\cal C}[312])
-s^2\,\overline{weight}({\cal C}[321])-s\,\overline{weight}({\cal C}[231]) \\
\overline{weight}({\cal C}[132]) & = & -s^3-s^2\,\overline{weight}({\cal C}[231])
-s^2\,\overline{weight}({\cal C}[213])-s\,\overline{weight}({\cal C}[321]) \\
\overline{weight}({\cal C}[213]) & = & -s^3-s^2\,\overline{weight}({\cal C}[312])
-s^2\,\overline{weight}({\cal C}[321])-s\,\overline{weight}({\cal C}[132]) \\
\overline{weight}({\cal C}[231]) & = & -s^3-s^2\,\overline{weight}({\cal C}[123])
-s^2\,\overline{weight}({\cal C}[132])-s\,\overline{weight}({\cal C}[312]) \\
\overline{weight}({\cal C}[312]) & = & -s^3-s^2\,\overline{weight}({\cal C}[213])
-s^2\,\overline{weight}({\cal C}[231])-s\,\overline{weight}({\cal C}[123]) \\
\overline{weight}({\cal C}[321]) & = & -s^3-s^2\,\overline{weight}({\cal C}[123])
-s^2\,\overline{weight}({\cal C}[132])-s\,\overline{weight}({\cal C}[213]) \\
\end{array} \right. . 
\end{equation}
By the symmetry of $B$,
all the clusters ${\cal C}[v]$, $v\in B$ have the same generating function
$\overline{weight}({\cal C}[123])$. 
Thus we can reduce these six equations to only one equation: 
$$\overline{weight}({\cal C}[123])=-s^3
-2s^2\, \overline{weight}({\cal C}[123])
-s\, \overline{weight}({\cal C}[123]). $$
After solving it, we have 
$$\overline{weight}({\cal C})=6\,\overline{weight}({\cal
   C}[123])=\frac{-6s^3}{1+2s^2+s} ,$$
and $$f(s)=\frac{1}{1-3s-\frac{-6s^3}{1+2s^2+s}}
=-\frac{2s^2+s+1}{s^2+2s-1}.$$
\bigskip \\
Assuming the cardinality of the alphabet $V$ is a symbol $d$,
$V=\{1,2,3,\cdots d \}$, let us find the generating 
function for the number of words which do not 
have three consecutive different letters as factors, 
i.e. $$B=\{123,124,125,\cdots, d(d-1)(d-3), d(d-1)(d-2)\}.$$ 
By the original Goulden-Jackson cluster method, 
we need to set up and solve a system
of $|B|=d(d-1)(d-2)$ linear equations. Using the symmetry of $B$,
we only need to set up and solve one equation:
$$\overline{weight}({\cal C}[123])=-s^3
-(d-1)(d-2)s^2\, \overline{weight}({\cal C}[123])
-(d-2)s\, \overline{weight}({\cal C}[123]). $$
Thus,
$$
\overline{weight}({\cal C})=d(d-1)(d-2)\,\overline{weight}({\cal C}[123])
=\frac{-d(d-1)(d-2)s^3}{1+(d-1)(d-2)s^2+(d-2)s},
$$
and
$$f(s)=\frac{1}{1-ds-\frac{-d(d-1)(d-2)s^3}{1+(d-1)(d-2)s^2+(d-2)s}}
=\frac{(-d^2+3d-2)s^2+(-d+2)s-1}{(d-2)s^2+2s-1}.$$
\bigskip \\
In general, if the set $B$ is invariant under the action of the
symmetric group, there exists a more efficient way to find the generating
function  ~(\ref{gene}). 
\bigskip \\
Two words $u$, $v$ are {\bf equivalent}, $u \equiv v$ 
if there exists a permutation $\lambda$ such that
$\lambda(u)=v.$ 
By symmetry, all the elements in the equivalence class of $v$ have the
same cluster generating function $\overline{weight}({\cal C}[v])$ .
\bigskip \\
Define the {\bf dimension} of a letter $v$, $dim(v)$  as the number
of different letters appeared in $v$. Then there are
 ${d} \choose {dim(v)}$ different words in the equivalence class of $v$. 
\bigskip \\
Suppose the words set $B$
is partitioned into different
equivalence classes $B_1,B_2,B_3,\cdots,B_k$, 
and $b_1,b_2,b_3,\cdots,b_k$ are 
the representatives respectively. Define 
 $ (b_i:B_j) := \sum_{b \in B_j} (b_i:b).$
Then the system~(\ref{mainsys}) could be reduced to: 
\begin{equation}
\overline{weight}({\cal C}[b_i])=
-\overline{weight}(b_i)-\sum_{j=1}^{k} (b_i:B_j)\overline{weight}({\cal C}[b_j]),\mbox{
  }i=1,\cdots,k.
\label{symequation}
\end{equation}
This is a system of $k$ linear equations with $k$ unknowns
$\overline{weight}({\cal C}[b_i])$, $i=1,2,\cdots, k$. Remember that $k$ is 
the number
of different equivalence classes in $B$. There are many fewer equations and
many fewer unknowns than in the original Goulden-Jackson cluster method, 
and thus everything is much more efficient. After solving the system, 
we could obtain $\overline{weight}({\cal C}) $ by
\begin{equation}
\overline{weight}({\cal C})=
\sum_{i=1}^{k}{d \choose dim(b_i)}\overline{weight}({\cal C}[b_i]).
\label{genecluster}
\end{equation}
\bigskip \\
Given $u=u_1u_2u_3\cdots u_n$,
let $H_i(u):=u_1u_2\cdots u_i$ and
$T_i(u):=u_{n-i+1}u_{n-i+2}\cdots u_{n-1}u_n$, where $0 \le i \le n$. It is easy to obtain
$$ (b_i : B_j) = \sum_{m=1}^{min(|b_i|,|b_j|)-1} I(T_m(b_i) \equiv H_m(b_j)) {d
\choose {dim(b_j)-dim(H_m(b_j))}} s^{|b_j|-m},$$
where 
$$I(T_m(b_i) \equiv H_m(b_j))= \left\{ 
\begin{array}{ll}
1, & \mbox{ if } T_m(b_i) \, \equiv \, H_m(b_j) ,\\
0, & \mbox{ otherwise}.
\end{array} \right.
$$
\bigskip \\
In the two examples below, the first can still be done with
the unextended Goulden-Jackson, since the number of letters is
numeric, $3$, but the second one requires the new extension, since
the number of letters is $d$, i.e. a {\it symbol}.
\bigskip \\
{\bf Example 1}: Let $V=\{1,2,3\}$. Find the generating function 
for the number of words which 
have neither three consecutive different letters nor 
three consecutive same letters as factors, i.e. 
$$B=\{123,132,213,231,312,321,111,222,333\}.$$
The set $B$ is invariant under the symetric group; and 
it can be partitioned into two
equivalence classes: 
$$B_1=\{123,132,213,231,312,321 \},\,\, 
B_2=\{111,222,333  \}.$$ 
By the system~(\ref{symequation}) and the equation~(\ref{genecluster}), 
we have
\begin{equation}
\nonumber
\left\{
\begin{array}{lcl}
\overline{weight}({\cal C}[123]) &=& -s^3-2s^2\,
\overline{weight}({\cal C}[123])-s\,
\overline{weight}({\cal C}[123])-s^2\,\overline{weight}({\cal C}[111])\\
\overline{weight}({\cal C}[111]) &=&-s^3
-2s^2\,\overline{weight}({\cal C}[123])
-s^2\,\overline{weight}({\cal C}[111])-s\,\overline{weight}({\cal C}[111])
\end{array}
\right. ,
\end{equation}
and
$$
\overline{weight}({\cal C})
= 6\,\overline{weight}({\cal C}[321])+3\,\overline{weight}({\cal C}[111]).
$$
Solving the system, finally we get
$$f(s)=\frac{1}{1-3s-\overline{weight}({\cal C}) }=\frac
{1}{1-3s-[6\,\overline{weight}({\cal C}[321])+3\,\overline{weight}({\cal
    C}[111])]}=-\frac{3s^2+s+1}{2s-1}
$$
\bigskip \\
{\bf Example 2:} Let $V=\{1,2,3,\cdots,d\}$.
Find the generating function 
for the number of words which 
have neither three consecutive different letters nor 
three consecutive same letters as factors.
\bigskip \\
By ~(\ref{symequation}) and ~(\ref{genecluster}), we have
\begin{equation}
\nonumber
\left\{
\begin{array}{lcl}
\overline{weight}({\cal C}[123])&=&-s^3
-(d-1)(d-2)s^2\,\overline{weight}({\cal C}[123])
-(d-2)s\,\overline{weight}({\cal C}[123])
-s^2\,\overline{weight}({\cal C}[111])\\
\overline{weight}({\cal C}[111])&=&-s^3
-(d-1)(d-2)s^2\,\overline{weight}({\cal C}[123])
-s^2\,\overline{weight}({\cal C}[111])
-s\,\overline{weight}({\cal C}[111])
\end{array} \right. ,
\end{equation}
and
$$
\overline{weight}({\cal C})
= d(d-1)(d-2)\,\overline{weight}({\cal C}[321])+
d\,\overline{weight}({\cal C}[111]).
$$
Finally, 
$$ f(s)=\frac{1}{1-ds-\overline{weight}({\cal C})}
=\frac{(-d^2+2d)s^3+(-d^2+2d-1)s^2+(1-d)s-1}{(d-1)s^2+s-1}.$$


%%The problem left is how to efficiently get $(b_i:B_j)$.
%%Because $B_j$ is invariant under the action of the symmetric group.
%%let $dim(v)$ be the number of letters used in v, 
%%and there are
%%many fewer equations and many fewer unknowns. The symbolic symetric method
%%could be efficient implemented if carefully calculating (U:V) for
%%equivalence classes.

 
\section{Finite Memory Self-Avoiding Walks} 
The set of so-called
{\it self-avoiding walks} (\cite{MS})could be regarded as a 
set of words over the
alphabet $$V=\{1,-1,2,-2,\cdots, d,-d\},$$ which avoid as factors
the words with as many $i$'s as $-i$'s for {\it each } $i$ between
$1$ and $d$.
In other words, it is a set of words that avoid the `bad factors' in $B=\{[1,-1],[1,2,-1,-2]
,[1,2,3,-1,-2,-3],\cdots \}$ and all their 
images under the action of the group of signed permutations.
J. Noonan (\cite{Noonan}) has a detailed discussion about the finite memory 
self-avoiding walks for the memory up to $8$. We have implemented the
procedures for symmetric cases under signed permutations too. 
Using our maple package we could automatically get the formula of the
generating functions for
 2-step, 4-step and 6-step memory 
self-avoiding walks.
For 8-step memory self-avoiding walks, the package set up a system of 
112 linear equations but our own computer was not big enough to
solve it.
\Section{The Maple package}

All the procedures are included in the package ``SYMBOLIC\_GJ", downloadable 
from the web address: 
$$ http://www.math.temple.edu/\sim wen/gj/SYMBOLIC\_GJ .$$
The main procedures take the cardinality of the alphabet as 
symbolic input. Moreover the package 
could be used to compute generating functions 
for the symmetric cases and for the 
finite memory self avoiding walks.   


\Section{Acknowledgment}

This work will be a part of the author's Ph.D. dissertation, written
under the direction of Professor Doron Zeilberger (Rutgers University).
I wish to thank Dr. Zeilberger for his generous support and encouragement.


\begin{thebibliography}{9}
\small  % Use 9 point text.


\bibitem{NZ}
John Noonan and Doron Zeilberger,
`` The Goulden-Jackson Cluster Method: Extensions, 
Applications, and Implementations'',
{\em J. Difference Eq. Appl.}, 5(1999), 355-377. 

\bibitem{Noonan}
John Noonan,
`` New Upper Bounds for The Connective Constants of Self-Avoiding Walks'',
{\em J. Stat. Phys.}, 91(1998), 871-888. 

\bibitem{MS}
N. Madras,and G. Slade, 
``The Self avoiding Walk'',
{\em } Birkhauser, Boston (1993).

\bibitem{GoJ1} I. Goulden  and D.M. Jackson,
{\em An inversion theorem for cluster decompositions of
sequences with distinguished subsequences},
J. London Math. Soc.(2){\bf 20} (1979), 567-576.
 
\bibitem{GoJ2} I. Goulden  and D.M. Jackson,
{\em "Combinatorial Enumeration"}, John  Wiley, 1983, New York.
 
\end{thebibliography}
  
\noindent
{\bf Xiangdong Wen; 638 Wachman Hall(038-16); 1805 N. Broad Street}\\
{\bf Department of Mathematics; Temple University; Philadelphia,PA 19122 }\\
{\bf wen@math.temple.edu; 215-204-1655;}\\
{\bf AMS Classification codes:05A15}\\
{\bf Electronic version: http://www.math.temple.edu/\~\,wen/gj}\\
\end{document}

% LocalWords:  symetric

