/* ===================================================================== */
/* Descriptive-statistics walkthrough on the colleges data:              */
/* PROC PRINT, MEANS, FREQ, UNIVARIATE, PLOT, CORR, and chi-squared      */
/* tests with PROC FREQ.                                                 */
/*                                                                       */
/* Data dictionary for the college files (fields are read in this order) */
/*   1. School      (NAME)   : name of each school                       */
/*   2. School_Type (TYPE)   : coded 'LibArts' for liberal arts and      */
/*                             'Univ' for university                     */
/*   3. SAT         (SAT)    : median combined Math and Verbal SAT score */
/*                             of students                               */
/*   4. Acceptance  (ACCEPT) : % of applicants accepted                  */
/*   5. $/Student   (SPENT)  : money spent per student in dollars        */
/*   6. Top 10%     (Top_HS) : % of students in the top 10% of their     */
/*                             h.s. graduating class                     */
/*   7. %PhD        (PHD)    : % of faculty at the institution that have */
/*                             PhD degrees                               */
/*   8. Grad%       (GRADS)  : % of students at institution who          */
/*                             eventually graduate                       */
/* ===================================================================== */

OPTIONS NODATE;

/* Read the complete (no missing values) college data, one school per    */
/* record, using fixed column positions.                                 */
/* NOTE(review): this step reads 'A:/colleges.dat' while Example 5 reads */
/* 'A:/college.dat'. Confirm which file name is correct; both paths are  */
/* left exactly as they were.                                            */
DATA EDUCATION;
    INFILE 'A:/colleges.dat';
    INPUT #1
        @1  (NAME)   ($18.)
        @20 (TYPE)   ($8.)
        @36 (SAT)    (4.)
        @44 (ACCEPT) (2.)
        @51 (SPENT)  (7.)
        @60 (Top_HS) (2.)
        @67 (PHD)    (3.)
        @76 (GRADS)  (2.);
RUN;

/* ========================================= */
/* Example 1: Get familiar with PROC PRINT   */
/* ========================================= */

/* Default listing: every variable, with the OBS column. */
PROC PRINT DATA=EDUCATION;
RUN;

PROC PRINT DATA=EDUCATION;
    VAR TYPE SPENT SAT;
    ID NAME;            /* The ID option: NAME replaces the OBS column */
    SUM SPENT SAT;      /* The SUM option: print column totals         */
RUN;

/* ========================================= */
/* Example 2: PROC MEANS                     */
/* ========================================= */
PROC MEANS DATA=EDUCATION
        MAXDEC = 2                                  /* options */
        RANGE MEAN STD STDERR SKEWNESS KURTOSIS     /* key words for descriptive statistics */
        CLM T PRT;
    VAR SAT ACCEPT SPENT TOP_HS PHD GRADS;
RUN;

/* The VARDEF Option - Affect only VAR and STD  */
/* Note that with VARDEF=n, STDERR is missing.  */
PROC MEANS DATA=EDUCATION
        MAXDEC = 2
        VARDEF=n
        RANGE MEAN STD STDERR SKEWNESS KURTOSIS;
    VAR SAT ACCEPT SPENT TOP_HS PHD GRADS;
RUN;

/* Example 2 (continued): PROC FREQ - one-way frequency tables */
PROC FREQ DATA = EDUCATION;
    TABLES TYPE PHD GRADS;
RUN;

/* ========================================= */
/* Example 3: Working with missing data      */
/* ========================================= */

/* Same record layout as the data dictionary at the top of this file;   */
/* this file is the version of the data that contains missing values.   */
OPTIONS NODATE;

DATA EDUC;
    INFILE 'A:/college-missing.dat';
    INPUT #1
        @1  (NAME)   ($18.)
        @20 (TYPE)   ($8.)
        @36 (SAT)    (4.)
        @44 (ACCEPT) (2.)
        @51 (SPENT)  (7.)
        @60 (Top_HS) (2.)
        @67 (PHD)    (3.)
        @76 (GRADS)  (2.);
RUN;

/* NMISS: count of missing values for each numeric variable. */
PROC MEANS DATA = EDUC NMISS;
RUN;

/* The MISSING option treats missing values as nonmissing and includes them */
/* in calculations of percentages and other statistics.                     */
/* The MISSPRINT option displays missing value frequencies for all tables,  */
/* even though PROC FREQ does not use the frequencies in the calculation of */
/* statistics.                                                              */
PROC FREQ DATA = EDUC;
    TABLE  GRADS;                   /* default              */
    TABLES GRADS / MISSPRINT;       /* the MISSPRINT option */
    TABLES GRADS / MISSING;         /* the MISSING option   */
RUN;

/* ========================================= */
/* Example 4: PROC UNIVARIATE                */
/* ========================================= */

/* Back to the data containing no missing values */
PROC UNIVARIATE DATA = EDUCATION PLOT;
    VAR SAT;
RUN;

PROC UNIVARIATE DATA = EDUCATION PLOT NORMAL;
    VAR SAT TOP_HS;
    PROBPLOT  SAT / NORMAL(MU=EST SIGMA=EST);                       /* Probability Plot       */
    QQPLOT    SAT / NORMAL(MU=EST SIGMA=EST);                       /* Quantile-Quantile Plot */
    HISTOGRAM SAT / NORMAL BARWIDTH=5 MIDPOINTS=1000 TO 1500 BY 25; /* Histogram              */
RUN;

/* The ID Statement */
PROC UNIVARIATE DATA = EDUCATION PLOT NORMAL;
    VAR SAT;
    ID NAME;
RUN;

/* The BY Statement */
/* BY-group processing needs the input ordered by the BY variable, so   */
/* sort first. The sorted copy goes to its own dataset so the row order */
/* of EDUCATION itself is left untouched.                               */
PROC SORT DATA = EDUCATION OUT = EDUC_BY_TYPE;
    BY TYPE;
RUN;

PROC UNIVARIATE DATA = EDUC_BY_TYPE PLOT NORMAL;
    VAR SAT;
    BY TYPE;
RUN;

/* ============================================ */
/* Example 5: Pearson Correlation Coefficient   */
/* ============================================ */

/* Same record layout as the data dictionary at the top of this file.   */
/* NOTE(review): file name differs from Example 1 ('A:/colleges.dat');  */
/* confirm which one is intended.                                       */
OPTIONS NODATE;

DATA EDUCATION;
    INFILE 'A:/college.dat';
    INPUT #1
        @1  (NAME)   ($18.)
        @20 (TYPE)   ($8.)
        @36 (SAT)    (4.)
        @44 (ACCEPT) (2.)
        @51 (SPENT)  (7.)
        @60 (Top_HS) (2.)
        @67 (PHD)    (3.)
        @76 (GRADS)  (2.);
RUN;

/* DATA= is spelled out on every step below instead of relying on the   */
/* most recently created dataset.                                       */
PROC PRINT DATA = EDUCATION;
RUN;

/* Assess Normality */
PROC UNIVARIATE DATA = EDUCATION NORMAL;
    VAR SAT ACCEPT TOP_HS;
RUN;

/* Check linear association */
PROC PLOT DATA = EDUCATION HPCT=50 VPCT=75;
    PLOT SAT*ACCEPT;
    PLOT SAT*Top_HS;
    PLOT ACCEPT*TOP_HS;
RUN;
QUIT;   /* PROC PLOT is interactive; end it explicitly */

/* Compute Pearson Correlation Coefficient */
PROC CORR DATA = EDUCATION;
    VAR SAT ACCEPT;
RUN;

PROC CORR DATA = EDUCATION;
    VAR SAT ACCEPT TOP_HS;
RUN;

/* VAR x WITH: correlate SAT against each WITH variable only */
PROC CORR DATA = EDUCATION;
    VAR SAT;
    WITH ACCEPT TOP_HS;
RUN;

/* Spearman Correlation Coefficient */
PROC CORR DATA = EDUCATION SPEARMAN;
    VAR SPENT PHD;
RUN;

/* Pearson on the same pair, for comparison with the Spearman result */
PROC CORR DATA = EDUCATION;
    VAR SPENT PHD;
RUN;

/* ====================================================================== */
/* Example 6: Chi-Squared Tests for Dependence between Nominal Variables  */
/* ====================================================================== */

/* Example 6.1: work with the raw data (list input, one record per row) */
DATA pop_kids;
    INFILE 'A:/popkids.dat';
    INPUT Gender$ Grade Age Race$ Urban_Rural$ School$ Goals$
          Grades Sports Looks Money;
RUN;

PROC PRINT DATA = pop_kids;
RUN;

/* Two-way tables with the default cell statistics */
PROC FREQ DATA = pop_kids;
    TABLE Gender*Goals;
    TABLE Race*Goals;
RUN;

/* With options: chi-squared tests, expected cell counts, exact tests */
PROC FREQ DATA = pop_kids;
    TABLE Gender*Goals / chisq expected exact;
    TABLE Race*Goals   / chisq expected exact;
RUN;

/* Example 6.2: working with a summarized frequency table directly.     */
/* This step reuses the name pop_kids and so replaces the raw data      */
/* read in Example 6.1.                                                 */
DATA pop_kids;
    INPUT Gender$ Goals$ Frequency;
    CARDS;
Boy Grades 117
Boy Popular 50
Boy Sports 60
Girl Grades 130
Girl Popular 91
Girl Sports 30
;
RUN;

PROC FREQ DATA = pop_kids;
    TABLES Gender*Goals / chisq expected exact;
    WEIGHT frequency;   /* Specify the variable containing counts */
RUN;