<#
Name: summit_sessions_scrape.ps1
Author: Amy Herold
Date: 27 March 2017
Purpose: Pull down all the session submitted to PASS Summit 2017. Because data. And because you want to.
14 June 2017
Updates:
Abstract field is now varchar(max). Cause precons get all the chars.
Upper bound to include precons.
Added SubmittedSession field.
Note: You will need to create the following table in a database to write all this to -
CREATE TABLE [dbo].[SubmittedSessions8](
[id] [int] IDENTITY(1,1) NOT NULL,
[Title] [varchar](500) NULL,
[Speaker] [varchar](100) NULL,
[Duration] [varchar](50) NULL,
[Track] [varchar](200) NULL,
[Abstract] [varchar](max) NULL,
[webid] [int] NULL,
[url] [varchar](1000) NULL,
[deleted] [bit] NULL CONSTRAINT [df_deleted8] DEFAULT ((0)),
[speaker_url] [varchar](1000) NULL,
[speaker_id] [int] NULL,
[SubmittedSession] [bit] NULL
) ON [PRIMARY]
http://www.pass.org/summit/2017/Speakers/CallforSpeakers/SubmittedSessions.aspx
#>
#-----------variables you need to set-----------------
$server = 'SERVER\INSTANCE';
$database = 'Summit2017';
$lower = 65066
$upper = 66731
#-----------------------------------------------------
for ([int]$i = $lower; $i -le $upper; $i++)
{
$url = 'http://www.pass.org/summit/2017/Speakers/CallForSpeakers/SubmittedSessions/SessionDetails.aspx?sid='+$i.ToString();
$url;
#----------------go scrape the web page------------
$webresponse = Invoke-Webrequest $url;
#---------------save the innerText to a variable for parsing--------------
$elements = $webresponse.AllElements | where {$_.class -eq 'pass-orator-session-details'} | select -First 1 -ExpandProperty innerText
#----------------get the link to the speaker page and the speakerid--------------
$speaker_url = $webresponse.Links | Select-Object -ExpandProperty href | Where-Object {$_ -like '*/summit/2017/Speakers/CallforSpeakers/SubmittedSessions/*'} | select -First 1
$speakerid = '';
$speakerid = $speaker_url.split("=")[1]
#---------------clear and fill $resultsArray with the remaining data-----------
$resultArray = @()
foreach($element in $elements.Split("`n")){
if($element.Length -gt "2")
{$resultArray += $element}
}
#--------------clear and populate $blurb with the session deatils-----------
$blurb = '';
for ([int]$k = 4; $k -le $resultArray.Count; $k++)
{
$out = $resultArray[$k]+"`r`n`r`n"
$blurb += $out
}
#--------------clear $sqlstmt and populate the rest of our variables with information from the session page----------------
#--------------replace instances of on single quote with two so inserts with these will work-------------------------------
$sqlstmt = '';
$title = $resultArray[0] -replace "'","''"
$speaker = $resultArray[1] -replace "'","''"
$duration = $resultArray[2]
$track = $resultArray[3]
#--------------do some formatting and fix the single quotes in the abstract--------------
$title2 = "'"+$title+"'"
$speaker2 = "'"+$speaker+"'"
$duration2 = "'"+$duration+"'"
$track2 = "'"+$track+"'"
$webid = $i.ToString();
$blurb2 = $blurb -replace "'","''"
#-------------create our insert statement and run it-----------
$sqlstmt = "insert SubmittedSessions (Title, Speaker, Duration, Track, Abstract, webid, url, speaker_url, speaker_id) values ($title2,$speaker2,$duration2,$track2,'$blurb2',$webid, '$url','$speaker_url',$speakerid)";
Invoke-Sqlcmd -ServerInstance $server -Database $database -Query $sqlstmt
}