We've discovered a useful way to get a little help with our work by using the PRXMATCH function. It has allowed you to create a much more succinct code, which does away with the creation of unnecessary fields and, furthermore, dispenses with the need for numerous data checks.
/* This program assumes postcode is in one field
Variable names used in this code:
**** Postal_code *****
**** Postcode_Format *****
**** Postcode_fault *****
**** P1 P2 P3 P4 P5 P6 P7 *****
**** Output = Postcode_cleansed *****
Please note this program will remove any records with invalid postcodes
This program makes use of PRXMATCH and PRXPARS.
SAS Version 9 is required.
Built by REW - 20/02/2006
*/
data work.abc;
input postcode $8.;
cards;
RG109UU
RG9 1T
CV1 5HW
CV 7AA
;
run;
%let Data_set =work.abc; /*Dataset with postcode to be checked*/
%let postcode_field =postcode; /*Add your input UPPERCASE postcode field*/
data &Data_set; /*remove 'spaces'*/
set &Data_set;
Postal_code=compress(&postcode_field,' ');
run;
data &Data_set; /*Identify the format of the postcode*/
format Postcode_fault $10.;
length P1 P2 P3 P4 P5 P6 P7 $1.;
set &Data_set;
If prxmatch("/[A-Z]/",SUBSTR(Postal_code,1,1))THEN P1='A'; else P1='N' ;
If prxmatch("/[A-Z]/",SUBSTR(Postal_code,2,1))THEN P2='A'; else P2='N' ;
If prxmatch("/[A-Z]/",SUBSTR(Postal_code,3,1))THEN P3='A'; else P3='N' ;
If prxmatch("/[A-Z]/",SUBSTR(Postal_code,4,1))THEN P4='A'; else P4='N' ;
If prxmatch("/[A-Z]/",SUBSTR(Postal_code,5,1)) and ^prxmatch("/d/",SUBSTR(Postal_code,5,1))
THEN P5='A'; else P5='N';
If prxmatch("/[A-Z]/",SUBSTR(Postal_code,6,1))THEN P6='A'; else if length(strip(Postal_code))>=6
and SUBSTR(Postal_code,6,1)=' ' then P6=' '; else if prxmatch("/d/",SUBSTR(Postal_code,6,1)) then P6='N';
If prxmatch("/[A-Z]/",SUBSTR(Postal_code,7,1)) THEN P7='A';else if SUBSTR(Postal_code,7,1)=' ' then P7=' ';
else if prxmatch("/d/",SUBSTR(Postal_code,7,1)) then P7='N';
Postcode_Format=P1||P2||P3||P4||P5||P6||P7;
run;
data &Data_set; /*Check postcode to see if it complies with a valid layout*/
format Postcode_fault $10.;
set &Data_set;
if Postal_code=' ' then Postcode_fault='Missing';
Else If strip(Postcode_Format) not in('ANNAA' 'ANNNAA','AANNAA','AANNNAA','ANANAA','AANANAA')
then Postcode_fault='invalid';
Else if substr(upcase(Postal_code),1,1) in ('Q','V','X') then Postcode_fault='1';
Else if substr(upcase(Postal_code),2,1) in ('I','J','Z') then Postcode_fault='2';
Else if substr(upcase(Postal_code),3,1) not in ('A','B','C','D','E','F','G','H','J','K','S','T','U','W')
and prxmatch("/[A-Z]/",SUBSTR(Postal_code,3,1)) then Postcode_fault='3';
Else if substr(upcase(Postal_code),6,1) in ('C','I','K','M','O','V') then Postcode_fault='6';
Else if substr(upcase(Postal_code),7,1) in ('C','I','K','M','O','V') then Postcode_fault='7';
else postcode_fault='';
run;
/*
PERFORM COUNTS AND DATA CHECKS BEFORE RUNNING NEXT SECTION
*/
data &Data_set;/*keep records with valid postcode layouts and those without a postcode_fault*/
set &Data_set;
where Postcode_Format in ('ANNAA' 'ANNNAA','AANNAA','AANNNAA','ANANAA','AANANAA')
and Postcode_fault='';
run;
data &Data_set; /*output corrected postcode format (with spaces)*/
length Postcode_cleansed $8.;
set &Data_set;
If Postcode_Format='ANNAA' then Postcode_cleansed=substr(postal_code,1,2)||' '||substr(postal_code,3,3);
If Postcode_Format='ANNNAA' then Postcode_cleansed=substr(postal_code,1,3)||' '||substr(postal_code,4,3);
If Postcode_Format='AANNAA' then Postcode_cleansed=substr(postal_code,1,3)||' '||substr(postal_code,4,3);
If Postcode_Format='AANNNAA' then Postcode_cleansed=substr(postal_code,1,4)||' '||substr(postal_code,5,3);
If Postcode_Format='ANANAA' then Postcode_cleansed=substr(postal_code,1,3)||' '||substr(postal_code,4,3);
If Postcode_Format='AANANAA' then Postcode_cleansed=substr(postal_code,1,4)||' '||substr(postal_code,5,3);
run;
data &Data_set (drop=Postcode_Format Postal_code Postcode_fault P1 P2 P3 P4 P5 P6 P7); /*Tidyup*/
set &Data_set;
run;
For more information on Perl expressions, visit Newsletter Issue 11, October - November 2004.