/* ================================
   Example 1: Cluster analysis
   taken from Example 4 of the SAS
   documentation to PROC CLUSTER
   ================================== */

options nocenter nodate pageno=1 linesize=132;
title  h = 1 j = l 'File: cluster.mammalsteeth.sas';
title2 h = 1 j = l 'Cluster Analysis of Mammals'' teeth data';

/* Read one record per mammal.
   Column layout required by the INPUT statement below:
     columns  1-16  mammal name (character)
     columns 21-28  eight single-digit tooth counts, v1-v8
   The in-stream records must stay aligned to these columns and must
   start on the line after CARDS; the lone semicolon ends the data. */
data teeth;
   input mammal $ 1-16 @21 (v1-v8) (1.);
   label v1='Top incisors'
         v2='Bottom incisors'
         v3='Top canines'
         v4='Bottom canines'
         v5='Top premolars'
         v6='Bottom premolars'
         v7='Top molars'
         v8='Bottom molars';
   cards;
BROWN BAT           23113333
MOLE                32103333
SILVER HAIR BAT     23112333
PIGMY BAT           23112233
HOUSE BAT           23111233
RED BAT             13112233
PIKA                21002233
RABBIT              21003233
BEAVER              11002133
GROUNDHOG           11002133
GRAY SQUIRREL       11001133
HOUSE MOUSE         11000033
PORCUPINE           11001133
WOLF                33114423
BEAR                33114423
RACCOON             33114432
MARTEN              33114412
WEASEL              33113312
WOLVERINE           33114412
BADGER              33113312
RIVER OTTER         33114312
SEA OTTER           32113312
JAGUAR              33113211
COUGAR              33113211
FUR SEAL            32114411
SEA LION            32114411
GREY SEAL           32113322
ELEPHANT SEAL       21114411
REINDEER            04103333
ELK                 04103333
DEER                04003333
MOOSE               04003333
;

/* principal components analysis of teeth
   here we score the principal components and output
   them to data set teeth2 */
proc princomp data=teeth out=teeth2;
   var v1-v8;
run;

/* average linkage cluster analysis
   a dendrogram (tree diagram) is also output.
   Note that the clustering variables are the original v1-v8 carried
   along in teeth2, not the principal component scores. */
proc cluster data=teeth2 method=average outtree=ttree
             ccc pseudo rsquare;
   var v1-v8;
   id mammal;
run;

/* --- PROC TREE prints the tree diagram
       here we also output a data set, called ttree2
       that contains four clusters --- */
proc tree data=ttree out=ttree2 nclusters=4;
   id mammal;
run;

/* --- the next set of statements sort the data sets by
       variable mammal and then merge the tree data set
       (with the cluster scores) with the teeth data set
       (with the principal components) --- */
proc sort data=teeth2;
   by mammal;
run;

proc sort data=ttree2;
   by mammal;
run;

data teeth3;
   merge teeth2 ttree2;
   by mammal;
run;

/* --- stuff for plotting --- */
/* Plot the second principal component against the first, using the
   cluster number (1-4) as the plotting symbol, then list the mammals
   by cluster.  The data set is named explicitly (teeth3, the merged
   scores + cluster membership) instead of relying on _LAST_. */
symbol1 c=black f=, v='1';
symbol2 c=black f=, v='2';
symbol3 c=black f=, v='3';
symbol4 c=black f=, v='4';

proc gplot data=teeth3;
   plot prin2*prin1=cluster;
run;

proc sort data=teeth3;
   by cluster;
run;

proc print data=teeth3;
   by cluster;
   var mammal prin1 prin2;
run;

/* --------------------------------------------------
 * Example 2: Cluster Analysis
 * Data from Morrison (1990), 3rd edition, p. 288-289
 * taken from Jolicoeur (1959, 1975) on measurements on the
 * skull dimensions of the wolf Canis Lupus L.
 * Variables are:
 *    Location: rm=rocky mountain ar=arctic
 *    Sex: m=male f=female
 *    X1 = palatal length
 *    X2 = postpalatal length
 *    X3 = zygomatic width
 *    X4 = palatal width outside the first upper molars
 *    X5 = palatal width inside the second upper molars
 *    X6 = width between the postglenoid foramina
 *    X7 = interorbital width
 *    X8 = least width of the braincase
 *    X9 = crown length of the first upper molar
 * ---------------------------------------------------- */
TITLE h=1 j=l 'File: ~carey/p7291dir/cluster1.wolves.sas';

/* List (blank-delimited) input: one wolf per record.
   subject is the record sequence number. */
DATA wolves;
   LENGTH location $2 wolf $5 sex $1;
   INPUT location $ wolf $ sex $ x1-x9;
   subject=_n_;
   LABEL X1 = 'palatal length'
         X2 = 'postpalatal length'
         X3 = 'zygomatic width'
         X4 = 'palatal width-1'
         X5 = 'palatal width-2'
         X6 = 'postg foramina width'
         X7 = 'interorbital width'
         X8 = 'braincase width'
         X9 = 'crown length';
   cards;
rm rmm1 m 126 104 141 81.0 31.8 65.7 50.9 44.0 18.2
rm rmm2 m 128 111 151 80.4 33.8 69.8 52.7 43.2 18.5
rm rmm3 m 126 108 152 85.7 34.7 69.1 49.3 45.6 17.9
rm rmm4 m 125 109 141 83.1 34.0 68.0 48.2 43.8 18.4
rm rmm5 m 126 107 143 81.9 34.0 66.1 49.0 42.4 17.9
rm rmm6 m 128 110 143 80.6 33.0 65.0 46.4 40.2 18.2
rm rmf1 f 116 102 131 76.7 31.5 65.0 45.4 39.0 16.8
rm rmf2 f 120 103 130 75.1 30.2 63.8 44.4 41.1 16.9
rm rmf3 f 116 103 125 74.7 31.6 62.4 41.3 44.2 17.0
ar arm1 m 117 99 134 83.4 34.8 68.0 40.7 37.1 17.2
ar arm2 m 115 100 149 81.0 33.1 66.7 47.2 40.5 17.7
ar arm3 m 117 106 142 82.0 32.6 66.0 44.9 38.2 18.2
ar arm4 m 117 101 144 82.4 32.8 67.5 45.3 41.5 19.0
ar arm5 m 117 103 149 82.8 35.1 70.3 48.3 43.7 17.8
ar arm6 m 119 101 143 81.5 34.1 69.1 50.1 41.1 18.7
ar arm7 m 115 102 146 81.4 33.7 66.4 47.7 42.0 18.2
ar arm8 m 117 100 144 81.3 37.2 66.8 41.4 37.6 17.7
ar arm9 m 114 102 141 84.1 31.8 67.8 47.8 37.8 17.2
ar arm10 m 110 94 132 76.9 30.1 62.1 42.0 40.4 18.1
ar arf1 f 112 94 134 79.5 32.1 63.3 44.9 42.7 17.7
ar arf2 f 109 91 133 77.9 30.6 61.9 45.2 41.2 17.1
ar arf3 f 112 99 139 77.2 32.7 67.4 46.9 40.9 18.3
ar arf4 f 112 99 133 78.5 32.5 65.5 44.2 34.1 17.5
ar arf5 f 113 97 146 84.2 35.4 68.7 51.0 43.6 17.2
ar arf6 f 107 97 137 78.1 30.7 61.6 44.9 37.3 16.5
;
run;

OPTIONS NODATE NONUMBER;
TITLE2 h=1 j=l 'Nearest Neighbor Cluster Analysis of Wolf data';

/* Density linkage using K=3 nearest neighbours, with TRIM=.1.
   NOTE(review): x9 is read and labelled above but is not in the VAR
   list (x1-x8) - confirm the omission is intentional. */
PROC CLUSTER DATA=wolves OUTTREE=wolves2 METHOD=density TRIM = .1 K=3;
   VAR x1-x8;
   ID wolf;
RUN;

TITLE2 h=1 j=l 'Tree diagram from the cluster analysis';
PROC TREE DATA=wolves2 horizontal;
   ID wolf;
run;

/* ===================================
   Example 3: cluster analysis
   taken from Example 4 of the SAS
   documentation to PROC CLUSTER
   ===================================== */
/* NOTE(review): the statements of this example appear to repeat
   Example 1 step for step - confirm whether both copies are needed. */

options nocenter nodate pageno=1 linesize=132;
title  h = 1 j = l 'File: cluster.mammalsteeth.sas';
title2 h = 1 j = l 'Cluster Analysis of Mammals'' teeth data';

/* Column input: name in columns 1-16, eight single-digit tooth
   counts in columns 21-28.  Records must stay column-aligned. */
data teeth;
   input mammal $ 1-16 @21 (v1-v8) (1.);
   label v1='Top incisors'
         v2='Bottom incisors'
         v3='Top canines'
         v4='Bottom canines'
         v5='Top premolars'
         v6='Bottom premolars'
         v7='Top molars'
         v8='Bottom molars';
   cards;
BROWN BAT           23113333
MOLE                32103333
SILVER HAIR BAT     23112333
PIGMY BAT           23112233
HOUSE BAT           23111233
RED BAT             13112233
PIKA                21002233
RABBIT              21003233
BEAVER              11002133
GROUNDHOG           11002133
GRAY SQUIRREL       11001133
HOUSE MOUSE         11000033
PORCUPINE           11001133
WOLF                33114423
BEAR                33114423
RACCOON             33114432
MARTEN              33114412
WEASEL              33113312
WOLVERINE           33114412
BADGER              33113312
RIVER OTTER         33114312
SEA OTTER           32113312
JAGUAR              33113211
COUGAR              33113211
FUR SEAL            32114411
SEA LION            32114411
GREY SEAL           32113322
ELEPHANT SEAL       21114411
REINDEER            04103333
ELK                 04103333
DEER                04003333
MOOSE               04003333
;

/* principal components analysis of teeth
   here we score the principal components and output
   them to data set teeth2 */
proc princomp data=teeth out=teeth2;
   var v1-v8;
run;

/* average linkage cluster analysis
   a dendrogram (tree diagram) is also output */
proc cluster data=teeth2 method=average outtree=ttree
             ccc pseudo rsquare;
   var v1-v8;
   id mammal;
run;

/* --- PROC TREE prints the tree diagram
       here we also output a data set, called ttree2
       that contains four clusters --- */
proc tree data=ttree out=ttree2 nclusters=4;
   id mammal;
run;

/* --- the next set of statements sort the data sets by
       variable mammal and then merge the tree data set
       (with the cluster scores) with the teeth data set
       (with the principal components) --- */
proc sort data=teeth2;
   by mammal;
run;

proc sort data=ttree2;
   by mammal;
run;

data teeth3;
   merge teeth2 ttree2;
   by mammal;
run;

/* --- stuff for plotting --- */
symbol1 c=black f=, v='1';
symbol2 c=black f=, v='2';
symbol3 c=black f=, v='3';
symbol4 c=black f=, v='4';

proc gplot data=teeth3;
   plot prin2*prin1=cluster;
run;

proc sort data=teeth3;
   by cluster;
run;

proc print data=teeth3;
   by cluster;
   var mammal prin1 prin2;
run;

/* ==========================
   Example 4: Cluster Analysis
   =========================== */

/****************************************************************/
/*          S A S   S A M P L E   L I B R A R Y                 */
/*                                                              */
/*    NAME: CLUSTEX4                                            */
/*   TITLE: DOCUMENTATION EXAMPLE 4 FROM PROC CLUSTER           */
/* PRODUCT: STAT                                                */
/*  SYSTEM: ALL                                                 */
/*    KEYS: CLUSTER                                             */
/*   PROCS: CLUSTER TREE STANDARD                               */
/*    DATA:                                                     */
/*                                                              */
/* SUPPORT: WSS                         UPDATE:                 */
/*     REF:                                                     */
/*    MISC:                                                     */
/*                                                              */
/****************************************************************/

/* Column input: name in columns 1-16, eight single-digit tooth
   counts in columns 21-28.  Records must stay column-aligned. */
data teeth;
   title 'HIERARCHICAL CLUSTER ANALYSIS OF MAMMALS'' TEETH DATA';
   title2 'Evaluating the Effects of Ties';
   input mammal $ 1-16 @21 (v1-v8) (1.);
   label v1='Top incisors'
         v2='Bottom incisors'
         v3='Top canines'
         v4='Bottom canines'
         v5='Top premolars'
         v6='Bottom premolars'
         v7='Top molars'
         v8='Bottom molars';
   cards;
BROWN BAT           23113333
MOLE                32103333
SILVER HAIR BAT     23112333
PIGMY BAT           23112233
HOUSE BAT           23111233
RED BAT             13112233
PIKA                21002233
RABBIT              21003233
BEAVER              11002133
GROUNDHOG           11002133
GRAY SQUIRREL       11001133
HOUSE MOUSE         11000033
PORCUPINE           11001133
WOLF                33114423
BEAR                33114423
RACCOON             33114432
MARTEN              33114412
WEASEL              33113312
WOLVERINE           33114412
BADGER              33113312
RIVER OTTER         33114312
SEA OTTER           32113312
JAGUAR              33113211
COUGAR              33113211
FUR SEAL            32114411
SEA LION            32114411
GREY SEAL           32113322
ELEPHANT SEAL       21114411
REINDEER            04103333
ELK                 04103333
DEER                04003333
MOOSE               04003333
;

/* Baseline runs in the original observation order; OUTTREE=_NULL_
   because only the printed output is wanted here. */
proc cluster data=teeth method=average nonorm outtree=_null_;
   var v1-v8;
   id mammal;
   title3 'Raw Data';
run;

proc cluster data=teeth std method=average nonorm outtree=_null_;
   var v1-v8;
   id mammal;
   title3 'Standardized Data';
run;

options mprint;

*-------------------------------------------------------------------+
|                                                                   |
| the macro CLUSPERM randomly permutes observations and does a      |
| cluster analysis for each permutation. the arguments are as       |
| follows:                                                          |
|                                                                   |
|    data     data set name                                         |
|    var      list of variables to cluster                          |
|    id       id variable for proc cluster                          |
|    method   clustering method (and possibly other options)        |
|    nperm    number of random permutations                         |
|                                                                   |
| output: one tree data set per permutation, named _tree_1 up to    |
| _tree_nperm. work data sets _temp_ and _perm_ are overwritten.    |
|                                                                   |
+-------------------------------------------------------------------;
%macro CLUSPERM(data,var,id,method,nperm);
   /* keep the loop index private so a caller's or a global
      macro variable named n is not overwritten */
   %local n;

   *------create temporary data set with random numbers------;
   /* one uniform random sort key per permutation; DO OVER walks the
      implicitly subscripted array _random_ */
   data _temp_;
      set &data;
      array _random_ _ran_1-_ran_&nperm;
      do over _random_;
         _random_=ranuni(835297461);
      end;
   run;

   *------permute and cluster the data------;
   %do n=1 %to &nperm;
      /* sorting by the n-th random key yields the n-th permutation */
      proc sort data=_temp_(keep=_ran_&n &var &id) out=_perm_;
         by _ran_&n;
      proc cluster data=_perm_ method=&method noprint outtree=_tree_&n;
         var &var;
         id &id;
         ; /* extra semicolon needed for macro bug */
      run;
   %end;
%mend;

*-------------------------------------------------------------------+
|                                                                   |
| the macro PLOTPERM plots various cluster statistics against the   |
| number of clusters for each permutation. the arguments are as     |
| follows:                                                          |
|                                                                   |
|    stat     names of variables from tree data set                 |
|    nclus    maximum number of clusters to be plotted              |
|    nperm    number of random permutations                         |
|                                                                   |
| input: the tree data sets _tree_1 up to _tree_nperm written by    |
| CLUSPERM. work data set _plot_ is overwritten.                    |
|                                                                   |
+-------------------------------------------------------------------;
%macro PLOTPERM(stat,nclus,nperm);
   /* keep the loop index private to this macro */
   %local n;

   *------concatenate tree data sets, keeping nclus or fewer clusters------;
   data _plot_;
      set %do n=1 %to &nperm; _tree_&n(in=_in_&n) %end; ;
      if _ncl_ <= &nclus;
      /* tag each row with the permutation it came from */
      %do n=1 %to &nperm;
         if _in_&n then _perm_=&n; ;
      %end;
      label _perm_='permutation number';
      keep _ncl_ &stat _perm_;
   run;

   *------plot the requested statistics by number of clusters------;
   proc plot data=_plot_;
      plot (&stat)*_ncl_=_perm_;
   run;
%mend;

*-------------------------------------------------------------------+
|                                                                   |
| the macro TREEPERM generates cluster-membership variables for a   |
| specified number of clusters for each permutation. proc print     |
| lists the objects in each cluster-combination and proc tabulate   |
| gives the frequencies and means. the arguments are as follows:    |
|                                                                   |
|    var      list of variables to cluster (no "-" or ":" allowed)  |
|    id       id variable for proc cluster                          |
|    meanfmt  format for printing means in proc tabulate            |
|    nclus    number of clusters desired                            |
|    nperm    number of random permutations                         |
|                                                                   |
| input: the tree data sets _tree_1 up to _tree_nperm written by    |
| CLUSPERM. work data sets _out_1 up to _out_nperm and _merge_ are  |
| overwritten.                                                      |
|                                                                   |
+-------------------------------------------------------------------;
%macro TREEPERM(var,id,meanfmt,nclus,nperm);
   /* keep the loop index private to this macro */
   %local n;

   *------create data sets giving cluster membership------;
   %do n=1 %to &nperm;
      proc tree data=_tree_&n noprint n=&nclus
                out=_out_&n(drop=clusname rename=(cluster=_clus_&n));
         copy &var;
         id &id;
      proc sort data=_out_&n;
         by &id &var;
      run;
   %end;

   *------merge the cluster variables------;
   /* all_clus concatenates the cluster numbers from every permutation,
      three characters per permutation, to label each combination */
   data _merge_;
      merge %do n=1 %to &nperm; _out_&n %end; ;
      by &id &var;
      length all_clus $ %eval(3 * &nperm);
      %do n=1 %to &nperm;
         substr( all_clus, %eval(1+(&n-1)*3), 3) = put( _clus_&n, 3.);
      %end;
   run;

   *------print and tabulate cluster combinations------;
   proc sort data=_merge_;
      by _clus_:;
   proc print data=_merge_;
      var &var;
      id &id;
      by all_clus notsorted;
   /* NOTE(review): FORMCHAR is a single blank here - confirm the
      intended number of blank formatting characters. */
   proc tabulate data=_merge_ order=data formchar=' ';
      class all_clus;
      var &var;
      table all_clus, n='FREQ'*f=5. mean*f=&meanfmt*(&var) /
            rts=%eval(&nperm*3+1);
   run;
%mend;

/* Explicit variable list: TREEPERM does not accept "-" or ":" lists. */
%let vlist=v1 v2 v3 v4 v5 v6 v7 v8;

TITLE3 'raw data';

*------cluster raw data with average linkage------;
%CLUSPERM( teeth, &vlist, mammal, average, 10);

*------plot statistics for the last 20 levels------;
%PLOTPERM( _psf_ _pst2_ _ccc_, 20, 10);

*------analyze the 4-cluster level------;
%TREEPERM( &vlist, mammal, 9.1, 4, 10);

title3 'Standardized Data';

*------cluster standardized data with average linkage------;
%CLUSPERM( teeth, &vlist, mammal, average std, 10);

*------plot statistics for the last 20 levels------;
%PLOTPERM( _psf_ _pst2_ _ccc_, 20, 10);

*------analyze the 4-cluster level------;
%TREEPERM( &vlist, mammal, 9.1, 4, 10);