Tuesday, June 11, 2013

BASH: Regular Expression Substring Extraction Samples

Sample 1: Download the files linked in an HTML source


#!/bin/bash
#
# Download every file that is linked from a saved HTML listing.
#
# Usage: <script> LISTING_FILE
#
# Each line of LISTING_FILE is searched for an href="..." attribute whose
# value ends in a dot followed by a 1-4 letter extension. The captured name is
# printed and then fetched with wget from BASE_URL into the ./src directory.

# Prefix that every captured href value is appended to.
readonly BASE_URL='http://jcifs.samba.org/src/examples'

# ERE used with [[ =~ ]]; group 1 is the href value.
#   [^"]*          keeps the capture inside the attribute's own quotes
#   \.             a literal dot
#   [[:alpha:]]    a real POSIX character class (a bare [:alpha:] would only
#                  match the characters ':', 'a', 'l', 'p' and 'h')
readonly HREF_REGEX='href="([^"]*\.[[:alpha:]]{1,4})"'

#######################################
# Print the href value found in one line of HTML.
# Arguments: $1 - a single line of HTML text
# Outputs:   the captured href value on stdout
# Returns:   0 if an href was found, 1 otherwise
#######################################
extract_href() {
  [[ $1 =~ $HREF_REGEX ]] || return 1
  printf '%s\n' "${BASH_REMATCH[1]}"
}

#######################################
# Read the listing file and download each linked file into ./src.
# Arguments: $1 - path of the HTML listing to scan
# Returns:   2 when no file argument is given
#######################################
main() {
  local input_file=${1-}
  local line target

  if [[ -z "$input_file" ]]; then
    printf 'Usage: %s LISTING_FILE\n' "${0##*/}" >&2
    return 2
  fi

  # -r keeps backslashes intact; the || clause handles a last line that has
  # no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    target=$(extract_href "$line") || continue
    printf '%s\n' "$target"
    # Subshell so the cd does not leak; wget only runs once we are in src.
    ( mkdir -p -- src && cd -- src && wget -- "${BASE_URL}/${target}" )
  done < "$input_file"
}

main "$@"


======================================
Data File Sample

<tr><td valign="top"><img src="/icons/unknown.gif" alt="[   ]"></td><td><a href="UrlReader.java">UrlReader.java</a></td><td align="right">18-Oct-2011 15:26  </td><td align="right">1.0K</td></tr>
<tr><td valign="top"><img src="/icons/unknown.gif" alt="[   ]"></td><td><a href="VerifyGuest.java">VerifyGuest.java</a></td><td align="right">18-Oct-2011 15:26  </td><td align="right">896 </td></tr>
<tr><td valign="top"><img src="/icons/unknown.gif" alt="[   ]"></td><td><a href="VerifyIO.java">VerifyIO.java</a></td><td align="right">18-Oct-2011 15:26  </td><td align="right">2.1K</td></tr>
<tr><td valign="top"><img src="/icons/unknown.gif" alt="[   ]"></td><td><a href="VerifyReads.java">VerifyReads.java</a></td><td align="right">18-Oct-2011 15:26  </td><td align="right">2.1K</td></tr>
<tr><td valign="top"><img src="/icons/unknown.gif" alt="[   ]"></td><td><a href="WaitNamedPipe.java">WaitNamedPipe.java</a></td><td align="right">18-Oct-2011 15:26  </td><td align="right">1.1K</td></tr>

Sample 2: Cut information from html source


 #!/bin/bash
#set -x   # uncomment to trace execution
#
# Print the section number and title of every numbered heading in an HTML file.
#
# Usage: <script> HTML_FILE
#
# Example input line:
#<h4 class="sect3"><span class="secnum">1.13.1.2</span> Volume Shadow Copy Service (VSS) Writer</h4>
# Example output line:
#   1.13.1.2 Volume Shadow Copy Service (VSS) Writer

# ERE used with [[ =~ ]]:
#   group 1 - text of the "secnum" span, which must end in a digit
#   group 2 - heading title, up to the closing </h...> tag at end of line
readonly HEADING_REGEX='secnum">([^<]*[0-9])</span>[[:space:]]+(.*)</h[^>]*>[[:space:]]*$'

#######################################
# Print "<number> <title>" for one heading line.
# Arguments: $1 - a single line of HTML text
# Outputs:   section number and title, separated by one space, on stdout
# Returns:   0 if the line is a numbered heading, 1 otherwise
#######################################
extract_heading() {
  [[ $1 =~ $HEADING_REGEX ]] || return 1
  # Quoted printf instead of unquoted echo so the title is not glob-expanded.
  printf '%s %s\n' "${BASH_REMATCH[1]}" "${BASH_REMATCH[2]}"
}

#######################################
# Scan the input file and print every numbered heading found.
# Arguments: $1 - path of the HTML file to scan
# Returns:   2 when no file argument is given
#######################################
main() {
  local input_file=${1-}
  local line

  if [[ -z "$input_file" ]]; then
    printf 'Usage: %s HTML_FILE\n' "${0##*/}" >&2
    return 2
  fi

  while IFS= read -r line || [[ -n "$line" ]]; do
    # Non-heading lines are simply skipped.
    extract_heading "$line" || true
  done < "$input_file"
}

main "$@"

Sample 3: Grab information from DDL scripts

 #!/bin/bash
#set -x   # uncomment to trace execution
#
# Print name, type, length and quoted comment of each column in a DDL script.
#
# Usage: <script> DDL_FILE      (the file is read from the sql/ directory)
#
# Example input line:
#  `Place_ID` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'Event ID',
# Example output line:
#   Place_ID bigint 20 Event ID

# ERE used with [[ =~ ]]. Double-quoted so the backticks can be escaped; a
# backslash before "(" is kept as-is inside double quotes.
#   group 1 - column name between backticks
#   group 2 - type name (everything up to the opening parenthesis)
#   group 3 - first number inside the parentheses
#   group 4 - last single-quoted string on the line
# [[:space:]] is used for the separator because "\t" inside a bracket
# expression means a backslash or the letter t, not a tab.
readonly COLUMN_REGEX="\`([^\`]+)\`[[:space:]]+([^[:space:](]+)\(([0-9]+).*'([^']*)'"

#######################################
# Print "<name> <type> <length> <comment>" for one column definition.
# Arguments: $1 - a single line of the DDL script
# Outputs:   the four captured fields, space separated, on stdout
# Returns:   0 if the line matched, 1 otherwise
#######################################
parse_column() {
  [[ $1 =~ $COLUMN_REGEX ]] || return 1
  printf '%s %s %s %s\n' \
    "${BASH_REMATCH[1]}" "${BASH_REMATCH[2]}" \
    "${BASH_REMATCH[3]}" "${BASH_REMATCH[4]}"
}

#######################################
# Scan sql/<file> and print every column definition found.
# Arguments: $1 - file name relative to the sql/ directory
# Returns:   2 when no file argument is given
#######################################
main() {
  local input_file=${1-}
  local line

  if [[ -z "$input_file" ]]; then
    printf 'Usage: %s DDL_FILE\n' "${0##*/}" >&2
    return 2
  fi

  while IFS= read -r line || [[ -n "$line" ]]; do
    # Lines that are not column definitions are skipped.
    parse_column "$line" || true
  done < "sql/${input_file}"
}

main "$@"

Sample 4: backslash in source file


 #!/bin/bash
 #set -x
# Download PDF files whose links use a backslash between year and file name.
#
# Usage: <script> HTML_FILE
#
# Each line of HTML_FILE is searched for href="YYYY\name.pdf". The year and
# the file name are printed, a directory named after the year is created, and
# the file is fetched into it with wget from PDF_BASE_URL/YYYY/name.pdf.

# Prefix that "<year>/<file>" is appended to.
readonly PDF_BASE_URL='http://www.people.okanagan.bc.ca/clee/bcssmc'

# ERE used with [[ =~ ]]:
#   group 1 - four-digit year
#   \\      - one literal backslash separating year and file name
#   group 2 - file name ending in a literal ".pdf"; [^"]* keeps the capture
#             inside the attribute's own quotes
readonly PDF_HREF_REGEX='href="([0-9]{4})\\([^"]*\.pdf)"'

#######################################
# Match one line against PDF_HREF_REGEX and print the captured fields.
# Arguments: $1 - a single line of HTML text
# Outputs:   the year and the file name, one per line, on stdout
# Returns:   0 on a match (BASH_REMATCH holds the groups), 1 otherwise
#######################################
match_pdf_href() {
  [[ $1 =~ $PDF_HREF_REGEX ]] || return 1
  printf '%s\n' "${BASH_REMATCH[1]}" "${BASH_REMATCH[2]}"
}

#######################################
# Scan the input file and download each linked PDF into its year directory.
# Arguments: $1 - path of the HTML file to scan
# Returns:   2 when no file argument is given
#######################################
main() {
  local input_file=${1-}
  local line year pdf_name

  if [[ -z "$input_file" ]]; then
    printf 'Usage: %s HTML_FILE\n' "${0##*/}" >&2
    return 2
  fi

  # read -r is required here: without it the backslash in the href is eaten.
  while IFS= read -r line || [[ -n "$line" ]]; do
    match_pdf_href "$line" || continue
    year=${BASH_REMATCH[1]}
    pdf_name=${BASH_REMATCH[2]}
    # Subshell so the cd does not leak; wget only runs once we are in $year.
    (
      mkdir -p -- "$year" \
        && cd -- "$year" \
        && wget -- "${PDF_BASE_URL}/${year}/${pdf_name}"
    )
  done < "$input_file"
}

main "$@"

No comments:

Post a Comment