proc sql: tips and translations for data step users by: gail jorgensen susan marcella
TRANSCRIPT
PROC SQL: Tips and Translations for Data Step Users
By: Gail Jorgensen Susan Marcella
AGENDA
• SQL Syntax Review• Joins Translated• SQL Strengths & Uses
PROC SQL: Tips and Translations for Data Step UsersSyntax
Proc SQL; create table/view newdsname as select var1, var2, … varN
from dsname
where condition ;Quit;
PROC SQL: Tips and Translations for Data Step Users JOIN vs MERGE
Types of JOINs• Inner Join – selects only matching records
(same as: if ina and inb)
• Outer Join – selects some non-matching records– Left Join – selects all records from first table, only
matching records from second(same as: if ina)
– Right join – selects all records from second table, only matching records from first
(same as: if inb)
-- Full join – selects all records from both tables(same as having no if statement)
PROC SQL: Tips and Translations for Data Step UsersInner Join
proc sql; create table ds_c as select ds_a1.*, ds_b.* from ds_a1, ds_b where ds_a1.idfld = ds_b.idno;quit;
data c; merge ds_a1(in=ina) ds_b(in=inb rename=(idno=idfld)); by idfld; if ina and inb;run;
Dataset DS_A1 Dataset DS_BBidfld col2
1 M1 N
2 O3 P
4 Q5 R
idno col3 col41 X C2 X D2 Y F4 Z5 Z7 Z
idfld col2 idno col3 col41 M 1 X C1 N 1 X C2 O 2 X D2 O 2 Y F4 Q 4 Z5 R 5 Z
idfld col2 col3 col41 M X C1 N X C2 O X D2 O Y F4 Q Z5 R Z
PROC SQL: Tips and Translations for Data Step UsersLeft Join
idfld col2 col51 M A1 N D2 O3 P J4 Q K5 R N
proc sql; create table sql_left as select a.*, b.* from inf_a as a left join inf_b as b on a.idfld = b.idfld;quit;
data ds_left; merge inf_a(in=ina) inf_b(in=inb); by idfld; if ina ;run;
idfld col3 col41 X C2 X D2 Y F4 Z5 Z7 Z
Dataset inf_a Dataset inf_b
idfld col2 col5 col3 col41 N D X C
1 M A X C
2 O Y F
2 O X D
3 P J
4 Q K Z
5 R N Z
idfld col2 col5 col3 col41 N D X C1 M A X C2 O Y F2 O X D3 P J4 Q K Z5 R N Z
Dataset sql_left Dataset ds_left
PROC SQL: Tips and Translations for Data Step UsersRight Join
proc sql;
create table sql_right as
select a.*, b.*
from inf_a as a right join inf_b as b
on a.idfld = b.idfld;
quit;
data ds_right; merge inf_a(in=ina) inf_b(in=inb); by idfld; if inb;run;
idfld col2 col5 col3 col41 N D X C
1 M A X C
2 O Y F
2 O X D
4 Q K Z
5 R N Z
. Z
idfld col2 col5 col3 col41 M A X C
1 N D X C
2 O X D
2 O Y F
4 Q K Z
5 R N Z
7 Z
PROC SQL: Tips and Translations for Data Step Users Full Join
Obs name recd sent1 Amanda yes no
2 Gabi yes yes
3 Jan yes yes
4 Jim no yes
5 Pam no no
Obs name recd sent1 Alison yes yes
2 Jan yes yes
3 Pam no no
4 Tom yes yes
CList07CList08
proc sql;create table sql_clist asselect c7.name, c7.recd as recd07, c8.recd as recd08, c7.sent as sent07, c8.sent as sent08from clist07 as c7 full join clist08 as c8on c7.name=c8.name;quit;
proc sort data=clist07; by name; run;proc sort data=clist08; by name; run;data data_clist;merge clist07 clist08 (rename=(recd=recd08 sent=sent08));by name;run;
PROC SQL: Tips and Translations for Data Step Users Full Join (Con’t)
Obs Name recd07 recd08 sent07 sent081 yes yes
2 Amanda yes no
3 Gabi yes yes
4 Jan yes yes yes yes
5 Jim no yes
6 Pam no no no no
7 yes yes
Obs name recd sent recd08 sent081 Alison yes yes
2 Amanda yes no
3 Gabi yes yes
4 Jan yes yes yes yes
5 Jim no yes
6 Pam no no no no
7 Tom yes yes
Sql_CList Data_CList
PROC SQL: Tips and Translations for Data Step UsersHandling Duplicate Variable Names
To always select the variable from one dataset:– Drop unwanted version of variable
(PROC SQL permits all SAS dataset options)
– Select variable from specific table
To keep variable from both tables:– Rename the variable in one dataset
To select variable based on value:– Use CASE statement
PROC SQL: Tips and Translations for Data Step UsersCASE Statement
proc sql;create table NewCList asselect case when missing(c7.name) then c8.name
else c7.name end as name, c7.recd as recd07, c8.recd as recd08, c7.sent as sent07, c8.sent as sent08from clist07 as c7 full join clist08 as c8on c7.name=c8.name;quit;
proc sort data=clist07; by name; run;proc sort data=clist08; by name; run;data data_clist;merge clist07 clist08 (rename=(recd=recd08 sent=sent08));by name;run;
PROC SQL: Tips and Translations for Data Step UsersCASE Statement - Results
Obs name recd07 recd08 sent07 sent081 Alison yes yes
2 Amanda yes no
3 Gabi yes yes
4 Jan yes yes yes yes
5 Jim no yes
6 Pam no no no no
7 Tom yes yes
PROC SQL: Tips and Translations for Data Step UsersDown Calculations
PROC SORT data=shs.exposure; by subject_id; run;DATA counters(KEEP=TableName MAXOBS TOTOBS); SET shs.exposure END=LAST; BY subject_id;length TableName $ 50;RETAIN MAXOBS OBSCNTR TOTOBS 0;TableName=“exposure";TOTOBS+1;OBSCNTR+1;IF LAST.subject_id THEN DO; IF MAXOBS < OBSCNTR THEN MAXOBS=OBSCNTR; OBSCNTR=0;END;IF LAST THEN OUTPUT chemcnts;label maxobs='Maximum number of obs per person' totobs='Total Number obs in table';run;proc print data=counters;run;
Obs TableName MAXOBS TOTOBS
1 exposure 14 2124
PROC SQL: Tips and Translations for Data Step UsersDown Calculations
proc sql;create table sqlcounter asselect distinct subject_id, count(*) as subjcntfrom fshs.exposuregroup by subject_id;
select “exposure" as TableName, max(subjcnt) as MaxObs, sum(subjcnt) as TotObsfrom sqlcounter;quit;
TableName MaxObs TotObs
exposure 14 2124
Obs subject_id subjcnt
1 178 7
2 306 1
3 307 1
4 308 1
5 326 3
6 330 1
7 331 1
8 332 1
sqlcounter
PROC SQL: Tips and Translations for Data Step UsersCounts and Nesting Queries
proc sql;select distinct genre, count(*)from itunesgroup by genre;quit;
proc sql outobs=1;select (select count(*) from itunes) as TotalSongs, (select count(distinct genre) from itunes) as GenreCnt, (select count(distinct artist) from itunes) as ArtistCnt, (select count(distinct album) from itunes) as AlbumCntfrom itunes;quit;
Genre
Alternative 9
Bluegrass 43
Blues 14
Children's Music 62
Christian & Gospel 88
Classical 74
Country 77
Easy Listening 31
Electronic 1
Folk 16
General Folk 18
Gospel & Religious 40
Hip Hop/Rap 2
Holiday 13
Inspirational 70TotalSongs GenreCnt ArtistCnt AlbumCnt
802 23 160 100
PROC SQL: Tips and Translations for Data Step UserDictionaries
proc sql; create view detail as select * from dictionary.columns
; create view extern as select * from dictionary.members
; create view tbl as select * from dictionary.tables
; create view gotem as select trim(libname) as LibName, trim(memname) as TableName, trim(name) as ColName, label as ColLabel
from sashelp.vcolumn ;quit;
SAS
PROC SQL: Tips and Translations for Data Step UserDictionaries – Getting variable names
proc sql;
/* get names of all variables you want */
select name into :drinkvars separated by ', '
from dictionary.columns
where libname=‘AUG' and memname='DEMOG' and
lowcase(name) contains ‘ndrk';
/* use your newly created macro variable in your select statement */
create table drinks as
select &drinkvars
from aug.demog;
quit;
PROC SQL: Tips and Translations for Data Step UserDictionaries – Getting variable names
proc sql;/* add the table alias to the front of each variable name
as you create your macro variable */ select 'd.'||name into :aliasvars separated by ', 'from dictionary.columnswhere libname='AUG' and memname='DEMOG' and
lowcase(name) contains ('ndrk');
/* do your merge or whatever using the macro variable you just created */ create table newtable as select &aliasvars, c.expcategoryfrom aug.demog as d left join aug.exposure as c on d.jcml_id=c.jcml_id;
quit;
PROC SQL: Tips and Translations for Data Step UserViews
• Views are ‘virtual tables’• Created with CREATE VIEW statement• Can be used as if they are normal physical tables• Enhance security
– can construct a view of only fields and rows that user is allowed to view• Enhance ease-of-use
– Can combine rows and columns from multiple tables into a single view• Facilitate data integrity
– Can have several views on the same table, but only have to update the base table– Users always see up-to-date data
proc sql; create view aug.testview as select d.subject_id, d.case_id, d.age, e.job_num, e.exposure_element from aug.demog as d, aug.exposure as e where d.subject_id=e.subject_id;quit;
PROC SQL: Tips and Translations for Data Step Users Creating Data Source Indicators
Obs famid name inc1 2 Art 22000
2 1 Bill 30000
3 3 Paul 25000
4 4 Karl 95000
dads Obs famid faminc96 faminc97 faminc981 3 75000 76000 77000
2 1 40000 40500 41000
3 2 45000 45400 45800
4 5 55000 65000 70000
5 6 22000 24000 28000
faminc
proc sql; create table sql_fj as select *, (dads.famid=faminc.famid) as indic, (dads.famid ~=.) as dadind, (faminc.famid ~=.) as famind, coalesce(dads.famid, faminc.famid) as fid from dads full join faminc on dads.famid=faminc.famid; quit;
proc sort data=dads out=sorted_dads; by famid; run; proc sort data=faminc out=sorted_faminc; by famid; run; data ds_fj; merge sorted_dads(in=in1) sorted_faminc(in=in2); by famid; if in1 and in2 then indic=1; else indic=0; dadind=in1; famind=in2; fid=famid; run;
PROC SQL: Tips and Translations for Data Step UsersFull Join - cont
Obs famid name inc faminc96 faminc97 faminc98 indic dadind famind fid1 1 Bill 30000 40000 40500 41000 1 1 1 1
2 2 Art 22000 45000 45400 45800 1 1 1 2
3 3 Paul 25000 75000 76000 77000 1 1 1 3
4 4 Karl 95000 . . . 0 1 0 4
5 . . 55000 65000 70000 0 0 1 5
6 . . 22000 24000 28000 0 0 1 6
Obs famid name inc faminc96 faminc97 faminc98 indic dadind famind fid1 1 Bill 30000 40000 40500 41000 1 1 1 1
2 2 Art 22000 45000 45400 45800 1 1 1 2
3 3 Paul 25000 75000 76000 77000 1 1 1 3
4 4 Karl 95000 . . . 0 1 0 4
5 5 . 55000 65000 70000 0 0 1 5
6 6 . 22000 24000 28000 0 0 1 6
Sql_fj
Ds_fj
PROC SQL: Tips and Translations for Data Step UsersAdditional Uses
proc sql;title 'Bad Control Matches';select c.subject_id, c.casenum, c.gender, age as CntlAge label='CntlAge', (select age from cases where subject_id=c.casenum) as CaseAge, abs(cntlage-calculated caseage) as AgeDifffrom controls as c left join demog as don c.subject_id=d.subject_idwhere (not within5 and not within10);quit;
Scenario:
For a case/control study, verify that all controls have age within 5 to 10 years of the related case age.
PROC SQL: Tips and Translations for Data Step UserMerging Multiple Tables
proc sql; create table sql_c3 as select a.name, b.class, case when missing(c.grade1) then "missing 1“ when missing(c.grade2) then "missing 2“ when missing(c.grade3) then "missing 3“ when missing(c.grade4) then "missing 4"
else "none missing“ end as miss_grade from indat_a as a, indat_b as b, indat_c as c where c.classid=b.classid and c.perid=a.perid;quit;
Obs name class miss_grade1 Mary Art none missing
2 Olive Art none missing
3 Quincy Art none missing
4 Nat Art none missing
5 Pat Art none missing
6 Quincy Music missing 4
7 Richard Music none missing
8 Mary Math none missing
9 Nat Math none missing
10 Olive Math none missing
11 Pat Math none missing
12 Quincy Math missing 2
13 Richard Math none missing
14 Mary English none missing
15 Nat English none missing
16 Olive English none missing
17 Pat English none missing
18 Quincy English none missing
19 Richard English none missing
SAS and all other SAS Institute Inc. product or service names are registered trademarks or trademarks of SAS Institute Inc. in the USA and other countries. ® indicates USA registration.