/* simmatch.cpp CDocTemplate
//
// A program used to check for random matches between simulated
// languages.
// Copyright 1999 by Mark Rosenfelder (markrose@zompist.com)
// May be freely used, reproduced, or modified.
//
// For the logic underlying this model, see my paper
// "How likely are chance resemblances between languages?"
// at http://www.zompist.com/chance.htm
//
// The program allows the testing the model by creating two
// randomly generated languages, and checking for random matches
// between them.
//
// The user can determine the phonological complexity of the
// languages (which indirectly determines the lexicon size), and
// can specify the phonetic and semantic leeway allowed for a match.
//
// The program was written on a Windows system, but I've run it
// with no problems on Unix.
*/
#include "stdio.h"
#include "stdlib.h"
#include "malloc.h"
#include "stdarg.h"
#include "string.h"
#include "time.h"
/******* Simulation parameters *********/
#define nV 5 /* number of vowels */
#define nC 14 /* number of consonants */
#define phonLee 3 /* phonetic leeway */
#define semLee 10 /* semantic leeway */
char form[] = "CVC"; /* phonological structure */
/******* Global variables *********/
int nLex; /* number of words */
int nForm; /* number of letters in form[] */
int *fac; /* used by PosIndex() for w -> form map */
/* Strings for interpreting 'words'.
// Adjacent letters represent similar sounds.
// The program doesn't know anything about phonetics;
// its words are just sequences of numbered phonemes,
// with similarity being represented by similar numbers.
// These strings are for representing them to the user.
//
// Add to these if nC or nV is bigger than these strings.
*/
char *vowel = "aeiuoOUIEA";
char *conso = "ptkgdbvzsTDGxflrwmnNy";
/******* Operational parameters *********/
/* Output words as strings not numbers */
#define SHOW_AS_CHAR
/* Print just matches */
#define PRINT_ONLY_MATCHES
/* Print the entire lexicon */
/* #define PRINT_LANGUAGES */
#define TRUE 1
#define FALSE 0
/******* Prototypes *********/
int *MakeLanguage( char *name );
void PrintWord( int w, int mng );
int Compare( char *aname, int *a, char *bname, int *b );
int Match( int p, int q, int n, int dist );
int PosIndex( int w, int pos );
/******* main routine *********/
int main(int argc, char* argv[])
{
/* Declare variables here because (snif) this isn't C++ */
int i;
int *a, *b;
/* Seed the random number generator */
srand( (unsigned) time( NULL ));
/* Figure out the number of variations per digit */
nLex = 1;
nForm = strlen(form);
fac = (int *) calloc( nForm, sizeof(int) );
for (i = nForm - 1; i >= 0; i--)
{
int n = form[i] == 'V' ? nV : nC;
if (i == nForm - 1)
fac[i] = n;
else
fac[i] = n * fac[i+1];
}
/* # words is the number of variations from the 1st digit on */
nLex = fac[0];
/* Create our two languages */
a = MakeLanguage( "A" );
b = MakeLanguage( "B" );
/* Count the matches */
Compare( "A", a, "B", b );
/* Cleanup */
free(a);
free(b);
free(fac);
printf( "\nPress return to continue\n" );
getchar();
return 1;
}
/******* MakeLanguage *********
//
// Create a fake language according to the current parameters.
//
// A language conceptually consists of a set of nLex words, each
// with a separate meaning. Meanings are represented simply by
// numbers from 0 to nLex, randomly distributed among the words.
// An array of these meanings is returned by this function.
//
// The words-- their phonetic shapes-- are not actually stored,
// but re-created on demand. We simply assume that there's a word
// for every possible phonetic shape, listed in a standard order.
//
// The possible shapes are given by form[], and the order is given
// by the vowel[] and conso[] arrays.
//
// Then we simply need some rules for figuring out the actual
// form for word i. These are given by the PosIndex routine.
*/
int *MakeLanguage( char *name )
{
/* Allocate space for the language */
int *lg = (int *) calloc( nLex, sizeof(int) );
int w;
/* To allocate the nLex meanings automatically, we keep a list
// of unused meanings and use them randomly, one at a time.
// This list of course starts out with all nLex possibilities. */
int *meanings = (int *) calloc(nLex, sizeof(int));
int meanLeft = nLex;
for (w = 0; w < nLex; w++)
meanings[w] = w;
/* Now generate nLex words */
for (w = 0; w < nLex; w++)
{
/* If there's only one meaning left, just grab it */
if (meanLeft == 1)
lg[w] = meanings[0];
/* Otherwise, pick randomly from the remaining meanings */
else
{
int whichMean = rand() % meanLeft;
lg[w] = meanings[whichMean];
/* The selected meaning is replaced with the last
// meaning in the list, which thus shrinks by 1. */
meanings[whichMean] = meanings[meanLeft - 1];
meanLeft--;
}
}
free( meanings );
#ifdef PRINT_LANGUAGES
printf( "\nLanguage %s\n", name );
for (w = 0; w < nLex; w++)
{
PrintWord( w, lg[w] );
printf( " " );
}
printf( "\n" );
#endif
return lg;
}
/******* PosIndex *********
//
// Given word w, this word will generate its phonetic shape.
//
// It should be called one character at a time, from 0 to nForm - 1.
// It returns an integer representing which vowel or consonant
// word w has.
//
// Suppose form[] is CVC. As we go down the list of words, the
// last character (nForm - 1) just increments from 0 to nC
// over and over again-- so it's always w mod nC.
//
// The character before it starts out at 0 for the first cycle,
// 1 for the next, and so on-- changing every nC words.
//
// The character before *that* does the same, but changes only as
// the following characters finish their entire cycle-- nC * nV.
//
// So the general formula for all but the last character is that
// we divide by the size of the following characters' cycle.
// We figured that out in main() and stored it in fac[].
*/
int PosIndex( int w, int pos )
{
int n = form[pos] == 'C' ? nC : nV;
if (pos == nForm - 1)
return w % n;
else
return (w / fac[ pos + 1 ]) % n;
}
/******* PrintWord *********
//
// A routine to print out the phonetic shape of word w, as well
// as its meaning. It uses PosIndex to figure out the phonetics.
*/
void PrintWord( int w, int mng )
{
int pos;
for (pos = 0; pos < nForm; pos++)
{
int ix = PosIndex( w, pos );
char c = form[pos] == 'C' ? conso[ix] : vowel[ix] ;
# ifdef SHOW_AS_CHAR
printf( "%c", c );
# else
printf( "%i", ix );
# endif
}
if (nForm > 2)
printf( "." );
printf( "%2i", mng );
}
/******* Match *********
//
// Determine whether two phonetic or semantic values match,
// to within the specified leeway.
//
// (Remember that both sounds and meanings are represented, in this
// simulation, by simple integers.)
//
// Value q is considered to match value p if it falls within the
// range p - dist/2 ... p + dist/2. (We adjust the bottom range
// upward one if the leeway is even.)
//
// The values OVERLAP at the ends. Otherwise, there would be less
// matches than expected values near the ends of their range.
//
// Obviously a more sophisticated calculation could be used-- e.g.
// use a multi-dimensional model of phonological space. But
// a model where we just specify how many sounds each sound can
// match is a good approximation to this.
*/
int Match( int p, int q, int n, int dist )
{
/* Fast check for exact matches */
if (p == q)
return TRUE;
else
{
int i;
int d2 = dist / 2;
int odd = dist % 2 == 1;
/* Check each value in the target range */
for (i = -d2 + (odd ? 0 : 1); i <= d2; i++)
{
int comparand = q + i;
/* Make the ends of the range of values overlap */
if (comparand >= n)
comparand -= n;
else if (comparand < 0)
comparand += n;
if (p == comparand)
return TRUE;
}
}
return FALSE;
}
/******* Compare *********
//
// Compare two languages, reporting the number of matches.
//
// Two numbers are reported:
//
// * the "raw" number of matches, corresponding to the 'naive'
// match probability of phonLee * semLee. There can be more than
// one match in B for each word in A, this way.
//
// * a number restricted to one per word, corresponding to the
// corrected probability 1 - ((1 - p)^m) discussed later in the paper.
// Here, there is no more than one match per word in A.
//
// This match routine corresponds to the methodology of checking for
// a match in B for each word in language A.
//
// This can lead to using the same word in B more than once. This
// doesn't bother some of the people who amass apparent coincidences,
// and if we're testing their claims we must use the same methodology.
//
// So, we return two sets of results, allowing and disallowing
// the use of the same word in B more than once.
*/
int Compare( char *aname, int *a, char *bname, int *b )
{
int w, v, i;
int nMatch = 0;
int nTMatch = 0;
int nMatch1 = 0;
int nTMatch1 = 0;
/* Arrays used to store phonetic realizations of words */
int *apos = (int *) calloc( nForm, sizeof(int) );
int *bpos = (int *) calloc( nForm, sizeof(int) );
/* Array that remembers words in B we already used */
int *used = (int *) calloc( nLex, sizeof(int) );
for (v = 0; v < nLex; v++)
used[v] = FALSE;
printf( "Comparing languages %s and %s\n", aname, bname );
printf( "Format: word-in-A > phonetic-matches-in-B\n" );
printf( "Semantic matches indicated by +\n" );
printf( "Words in B already used marked by X\n" );
/* For each word in A... */
for (w = 0; w < nLex; w++)
{
int printedA = FALSE;
int subMatch = 0;
int subMatch1 = 0;
#ifndef PRINT_ONLY_MATCHES
printf( "\n" );
PrintWord( w, a[w] );
printf( " > " );
printedA = TRUE;
#endif
/* Find its phonetic form (as an integer array) */
for (i = 0; i < nForm; i++)
apos[i] = PosIndex( w, i );
/* Check it against each word in B */
for (v = 0; v < nLex; v++)
{
int cont = TRUE;
/* Check the characters for a phonetic match, 1 at a time */
for (i = 0; cont && i < nForm; i++)
{
int n = form[i] == 'C' ? nC : nV;
bpos[i] = PosIndex( v, i );
if (!Match( apos[i], bpos[i], n, phonLee ))
cont = FALSE;
}
/* Did all the characters match (within the leeway)? */
if (cont)
{
/* Yes, check for a semantic match. */
int matches = Match( a[w], b[v], nLex, semLee);
/* Increment #'s of matches
// There's a separate count of only-use-B-once matches. */
if (matches)
{
if (!used[v])
subMatch1++;
subMatch++;
}
/* Report all phonetic matches, or just semantic ones */
#ifdef PRINT_ONLY_MATCHES
if (matches)
#endif
{
if (!printedA)
{
printf( "\n" );
PrintWord( w, a[w] );
printf( " > " );
printedA = TRUE;
}
PrintWord( v, b[v] );
printf( "+" );
if (used[v])
printf( "X" );
printf( " " );
}
/* Mark this word used
// We simple-mindedly keep this as an unsorted list */
if (matches)
used[v] = TRUE;
}
}
/*We're done with word w. Increment our subtotals */
nMatch += subMatch ? 1 : 0;
nTMatch += subMatch;
nMatch1 += subMatch1 ? 1 : 0;
nTMatch1 += subMatch1;
}
/* Report the results */
printf( "\n\nMatches in %i words:\n", nLex );
printf( "A matches B matches allowed: \n" );
printf( "allowed: >1 per word 1 per word\n" );
printf( "----------- ----------- ----------\n" );
printf( ">1 per word %8i %8i \n", nTMatch, nTMatch1 );
printf( " 1 per word %8i %8i \n", nMatch, nMatch1 );
/* Cleanup */
free(apos);
free(bpos);
free(used);
return nMatch;
}