Hello!
Started to use the latest yo.tv.ini and found a problem. The ini splits the title when it contains a colon separator: the text to the left of the colon becomes the title, and the text to the right of the colon is inserted as a sub-title. For example, "X-men Origins: Wolverine" is the correct title, but once yo.tv.ini is done with it, it looks like this:
Title: X-men Origins
Sub_title: Wolverine
Tried rooting around in the code to fix it but everything I try fails. Perhaps someone can help out?
The Ini:
**------------------------------------------------------------------------------------------------
* @header_start
* WebGrab+Plus ini for grabbing EPG data from TvGuide websites
* @Site: yo.tv
* @MinSWversion:
* @Revision 4 - [22/04/2016] 1NSdbZVbpZDX
* - fix title to match generic siteini, fix credits details
* added subtitles and production date
* @Revision 3 - [31/01/2016] 1NSdbZVbpZDX
* - fix subtitle, episode and gabs
* @Revision 2 - [28/01/2016] Francis De Paemeleere
* - create a country only channels.xml generation
* @Revision 1 - [27/01/2016] 1NSdbZVbpZDX
* - make a generic siteini
* @Revision 0 - [08/09/2014] Willy De Wilde/Francis De Paemeleere
* - creation
* @Remarks:
* @header_end
**------------------------------------------------------------------------------------------------
site {url=yo.tv|timezone=UTC|maxdays=14.14|cultureinfo=en-GB|charset=UTF-8|titlematchfactor=50|nopageoverlaps}
site {episodesystem=onscreen}
urldate.format {daycounter|0}
url_index{url |http://##COUNTRY##.yo.tv/api/GS?cid=##CHANNEL_ID##,&offset=,&day=|urldate|}
scope.range {(urlindex)|end}
index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
url_index.modify {replace|##COUNTRY##|'index_temp_1'}
index_temp_1.modify {substring(type=regex)|'config_site_id' "PROVIDER_ID:([^,]*)"}
url_index.modify {replace|##PROVIDER_ID##|'index_temp_1'}
index_temp_1.modify {substring(type=regex)|'config_site_id' "CHANNEL_ID:([^,]*)"}
url_index.modify {replace |##CHANNEL_ID##|'index_temp_1'}
end_scope
url_index.headers {customheader=Accept-Encoding=gzip,deflate}
urldate.format {daycounter|0}
* Index-page scrubs: one element per show, then start time, title, sub-title and production year.
*index_showsplit.scrub {regex (debug)||<a style='width:(.*?)</a>||}
index_showsplit.scrub {multi |["|<a|</a>|"]}
*index_start.scrub {regex ||data-time='(.*?)'||}
* Start time is the quoted value of the data-time attribute.
index_start.scrub {single |data-time=|'|'|data-flags}
* Title: the <h2> text is split on ": " and only the first part is kept (include=first).
* This is the split that turns "X-men Origins: Wolverine" into title "X-men Origins".
index_title.scrub {single (separator=": " include=first)|<h2|</i> | </h2>| <h3>}
*index_title.scrub {regex (separator=": " include=first)||>(.*?)\s<\/h2>||} *OLD
index_title.modify {cleanup (tags="<"">")}
* Sub-title: the same <h2> text split on ": ", keeping everything except the first part (exclude=first).
index_subtitle.scrub {single (separator=": " exclude=first)|<h2|> | </h2>| <h3>}
index_subtitle.modify {cleanup (tags="<"">")} *NEW
* index_temp_3 captures a four-digit year in parentheses just before the closing </h2>.
index_temp_3.scrub {regex ||\((\d{4})\).<\/h2>\s<h3>||} *for productiondate
* Strip a trailing "(YYYY)" and the character before it from title and sub-title.
index_title.modify {remove(type=regex)|(.\(\d{4}\)$)}
index_subtitle.modify {remove(type=regex)|(.\(\d{4}\)$)}
scope.range {(indexshowdetails)|end}
index_temp_1.scrub {single|href='||'>|}
index_temp_2.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"} *for urlshow
index_urlshow.modify {set |http://'index_temp_2'.yo.tv'index_temp_1'}
index_urlshow.headers {customheader=Accept-Encoding=gzip,deflate} * to speedup the downloading of the detail pages
end_scope
scope.range {(showdetails)|end}
title.scrub {single (separator=": " include=first)|bg-title" >||</div>|}
title.modify {remove (type=regex)|(.\(\d{4}\)$)}
subtitle.scrub {single (separator=": " exclude=first)|<h2|>||</h2>}
subtitle.modify {remove (type=regex)|(\s\(\d{4}\))}
subtitle.modify {remove | </h2}
episode.scrub {single |prog box">|<span class="episode" >|</span><h3>|</div></div>}
episode.modify {remove (type=regex)|(:?.of.\d+)}
episode.modify {remove (type=regex)|(.<.*span.*)}
episode.modify {replace |Season |s}
episode.modify {replace | Episode |e}
*episode.modify {replace |Season |S.}
*episode.modify {replace | Episode |-Ep.}
*subtitle.scrub {regex (debug) ||prog box">(?:.*)<span >(.*?)<span >Episode||}
*subtitle.modify {remove (type=regex)|(.<.*span.*)}
subtitle.scrub {single |prog box">|<span >| </span></span>|<h3>}
subtitle.modify {cleanup (tags="<"">")}
subtitle.modify {remove (type=regex)|(.Episode.*)}
subtitle.modify {remove (type=regex)|(Rating.*)}
productiondate.modify {addstart |'index_temp_3'}
*rating.scrub {single |Rating:</span> ||</span>|</div></a></div></li><li}
description.scrub {single |prog box"|<h3>||</h3>}
description.modify {remove |</h3>}
category.scrub {multi |Genre</h2>|<div >|</div>|<div class="}
category.scrub {single |prog box">|><h2 class='|'>|<span}
*********general credits*********
*actor.scrub {regex ||<ul class="cast">(?:.*?)(?:<li><div>(.*?)</div></li>(?:.*?))*</ul>||}
*actor.modify {replace |</div><div class="partname">|[}
*actor.modify {addend (notnull)|]}
*actor.modify {remove | [Actor]}
***************
*specific credits
actor.scrub {multi |id="cast-box" >|<div>| </div><div class|castoverlay" >}
temp_1.scrub {multi |Crew</h2>|<li><div>|</div></li>|</ul>} *debug for more credits
temp_1.modify {replace |</div><div class="partname">|[}
temp_1.modify {addend (notnull)|]}
director.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Director\])"} *spa, eng
director.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Réalisateur\])"} *french
composer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Compositor\])"} *spa, port
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Produtora\])"} *port
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Produtor\])"} *port
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Productor\])"} *spa
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Productor ejecutivo\])"} *spa
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Compañías productores\])"} *spa
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Executive Producer\])"} *eng
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Production Company\])"} *eng
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Société de production\])"} *french
commentator.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Reporter\])"} *eng
commentator.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Jornalista\])"} *port
commentator.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Journaliste\])"} *french
writer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Writer\])"} *eng
writer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Autor\])"} *port
writer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Escritor\])"} *spa
presenter.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Apresentador\])"} *port
showicon.scrub {single |og:image"|content="https:|"| />}
showicon.modify {addstart (notnull)|https:}
end_scope
** _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
** ##### COUNTRY FILE CREATION (only to create the xxx_country.xml file)
**
** @auto_xml_country_start
*url_index{url|http://www.yo.tv}
*index_site_channel.scrub {multi|<ul data-filter="true" data-role="listview">|<li>|</li>|</ul>}
*index_site_id.scrub {multi|<ul data-filter="true" data-role="listview">|href="|"|</ul>}
*scope.range {(channellist)|end}
*index_site_id.modify {substring(type=regex)|http://(.*).yo.tv}
*index_site_id.modify {addstart|COUNTRY:}
*index_site_id.modify {addend|,ZIPCODE:}
*index_site_channel.modify {cleanup(tags="<"">")}
*index_site_id.modify {cleanup(removeduplicates=equal,100 link="index_site_channel")}
*end_scope
** @auto_xml_country_end
** _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
** ##### PROVIDER FILE CREATION (only to create the xxx_provider.xml file)
**
** @auto_xml_provider_start
*scope.range {(urlindex)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*url_index.modify {replace|##COUNTRY##|'index_temp_1'}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "ZIPCODE:([^,]*)"}
*url_index.modify {replace|##ZIPCODE##|'index_temp_1'}
*end_scope
*
*url_index{url |http://##COUNTRY##.yo.tv/api/setting?id=##ZIPCODE##&lookupid=1}
*index_site_channel.scrub {regex||"Name"\s*:\s*"([^"\\]*(?:\\.[^"\\]*)*)"||}
*index_site_id.scrub {regex||"Value"\s*:\s*"([^"\\]*(?:\\.[^"\\]*)*)"||}
*scope.range {(channellist)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*index_site_id.modify {addstart|COUNTRY:'index_temp_1',PROVIDER_ID:}
*index_site_id.modify {cleanup(removeduplicates=equal,100 link="index_site_channel")}
*end_scope
** @auto_xml_provider_end
** _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
** ##### CHANNELPROVIDER FILE CREATION (only to create the xxx-channel.xml file)
**
** @auto_xml_channelprovider_start
*scope.range {(urlindex)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*url_index.modify {replace|##COUNTRY##|'index_temp_1'}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "PROVIDER_ID:([^,]*)"}
*url_index.modify {replace |##PROVIDER_ID##|'index_temp_1'}
*end_scope
*
*url_index{url|http://##COUNTRY##.yo.tv/settings/headend/##PROVIDER_ID##}
*index_site_channel.scrub {multi(separator=" " include="alt=""<h2>")|<div id="channelbar" >|<li>|</li>|</ul>}
*index_site_channel.modify {remove |alt=}
*index_site_channel.modify {remove |"}
*index_site_channel.modify {remove |<h2>}
*index_site_channel.modify {remove |</h2>}
*index_site_id.scrub{multi|<div id="content" >|<li id="|"}
*scope.range {(channellist)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*index_temp_2.modify {substring(type=regex)|'config_site_id' "PROVIDER_ID:([^,]*)"}
*index_site_id.modify {addstart|COUNTRY:'index_temp_1',PROVIDER_ID:'index_temp_2',CHANNEL_ID:}
*index_site_id.modify {cleanup(removeduplicates=equal,100 link="index_site_channel")}
*end_scope
** @auto_xml_channelprovider_end
** _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
** ##### CHANNEL FILE CREATION (only to create the xxx-channel.xml file)
**
** @auto_xml_channel_start
*scope.range {(urlindex)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*url_index.modify {replace|##COUNTRY##|'index_temp_1'}
*end_scope
*
*url_index{url|http://##COUNTRY##.yo.tv/settings/headend/}
*index_site_channel.scrub {multi(separator=" " include="alt=""<h2>")|<div id="channelbar" >|<li>|</li>|</ul>}
*index_site_channel.modify {remove|alt=}
*index_site_channel.modify {remove|"}
*index_site_channel.modify {remove|<h2>}
*index_site_channel.modify {remove|</h2>}
*index_site_id.scrub{multi|<div id="content" >|<li id="|"}
*scope.range {(channellist)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*index_site_id.modify {addstart|COUNTRY:'index_temp_1',CHANNEL_ID:}
*index_site_id.modify {cleanup(removeduplicates=equal,100 link="index_site_channel")}
*end_scope
** @auto_xml_channel_end
Thanks for the tips, Blackbear199! That put me in the right frame of mind to dare take another crack at it, and this time more successfully. There were a few additional steps to do besides those you mentioned. I removed all colon separators, because yo.tv doesn't use them to separate title from subtitle (not as far as I can see anyway, and I have looked all over the place). The index_subtitle scrub was searching in the h2 tag, where the title resides, and I had to change it to search the h3 tag, where the sub-title is. I still don't know if I used entirely correct coding for this, but the overall scrub seems to go well now. Funny thing... scrub time is now more than twice as fast. If the title has a colon, everything after the colon is correctly treated as part of the title. A sub-title is created only if there is content in the h3 tag.
Best regards
jagad
New yo.tv.ini:
**------------------------------------------------------------------------------------------------
* @header_start
* WebGrab+Plus ini for grabbing EPG data from TvGuide websites
* @Site: yo.tv
* @MinSWversion:
* @Revision 4 - [22/04/2016] 1NSdbZVbpZDX
* - fix title to match generic siteini, fix credits details
* added subtitles and production date
* @Revision 3 - [31/01/2016] 1NSdbZVbpZDX
* - fix subtitle, episode and gabs
* @Revision 2 - [28/01/2016] Francis De Paemeleere
* - create a country only channels.xml generation
* @Revision 1 - [27/01/2016] 1NSdbZVbpZDX
* - make a generic siteini
* @Revision 0 - [08/09/2014] Willy De Wilde/Francis De Paemeleere
* - creation
* @Remarks:
* @header_end
**------------------------------------------------------------------------------------------------
site {url=yo.tv|timezone=UTC|maxdays=14.14|cultureinfo=en-GB|charset=UTF-8|titlematchfactor=50|nopageoverlaps}
site {episodesystem=onscreen} *Added
urldate.format {daycounter|0}
url_index{url |http://##COUNTRY##.yo.tv/api/GS?cid=##CHANNEL_ID##,&offset=,&day=|urldate|}
scope.range {(urlindex)|end}
index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
url_index.modify {replace|##COUNTRY##|'index_temp_1'}
index_temp_1.modify {substring(type=regex)|'config_site_id' "PROVIDER_ID:([^,]*)"}
url_index.modify {replace|##PROVIDER_ID##|'index_temp_1'}
index_temp_1.modify {substring(type=regex)|'config_site_id' "CHANNEL_ID:([^,]*)"}
url_index.modify {replace |##CHANNEL_ID##|'index_temp_1'}
end_scope
url_index.headers {customheader=Accept-Encoding=gzip,deflate}
urldate.format {daycounter|0}
* Index-page scrubs: one element per show, then start time, title, sub-title and production year.
*index_showsplit.scrub {regex (debug)||<a style='width:(.*?)</a>||}
index_showsplit.scrub {multi |["|<a|</a>|"]}
*index_start.scrub {regex ||data-time='(.*?)'||}
* Start time is the quoted value of the data-time attribute.
index_start.scrub {single |data-time=|'|'|data-flags}
* Title: the <h2> text is kept whole; the ": " separator split has been removed.
* NOTE(review): this scrub starts the block at "</i> "; a later reply in this thread reports missing film shows with it and uses ">" instead.
index_title.scrub {single|<h2|</i> | </h2>| <h3>} *Changed..colon separator removed
*index_title.scrub {single (separator=": " include=first)|<h2|</i> | </h2>| <h3>} *OLD
*index_title.scrub {regex (separator=": " include=first)||>(.*?)\s<\/h2>||} *OLD
index_title.modify {cleanup (tags="<"">")}
* Sub-title: now read from the <h3> element instead of the tail of the <h2> text.
index_subtitle.scrub {single|<h3|> | <span></span>|</h3>} *Changed..no search for subtitle in h2 tags. Subtitle exists in the h3 tag
*index_subtitle.scrub {single (separator=": " exclude=first)|<h2|> | </h2>| <h3>} *OLD
index_subtitle.modify {cleanup (tags="<"">")} *NEW
* index_temp_3 captures a four-digit year in parentheses just before the closing </h2>.
index_temp_3.scrub {regex ||\((\d{4})\).<\/h2>\s<h3>||} *for productiondate
* Strip a trailing "(YYYY)" and the character before it from title and sub-title.
index_title.modify {remove(type=regex)|(.\(\d{4}\)$)}
index_subtitle.modify {remove(type=regex)|(.\(\d{4}\)$)}
scope.range {(indexshowdetails)|end}
index_temp_1.scrub {single|href='||'>|}
index_temp_2.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"} *for urlshow
index_urlshow.modify {set |http://'index_temp_2'.yo.tv'index_temp_1'}
index_urlshow.headers {customheader=Accept-Encoding=gzip,deflate} * to speedup the downloading of the detail pages
end_scope
scope.range {(showdetails)|end}
title.scrub {single|bg-title" >||</div>|} *Changed..colon separator removed.
*title.scrub {single (separator=": " include=first)|bg-title" >||</div>|}
title.modify {remove (type=regex)|(.\(\d{4}\)$)}
*subtitle.scrub {single (separator=": " exclude=first)|<h2|>||</h2>} *OLD disabled...subtitle doesn't exists in the h2 tag
*subtitle.modify {remove (type=regex)|(\s\(\d{4}\))} *OLD disabled...subtitle doesn't exists in the h2 tag
*subtitle.modify {remove | </h2} *OLD disabled...subtitle doesn't exists in the h2 tag
episode.scrub {single |prog box">|<span class="episode" >|</span><h3>|</div></div>}
episode.modify {remove (type=regex)|(:?.of.\d+)}
episode.modify {remove (type=regex)|(.<.*span.*)}
episode.modify {replace |Season |s} *Changed
episode.modify {replace | Episode |e} *Changed
*episode.modify {replace |Season |S.} *OLD disabled
*episode.modify {replace | Episode |-Ep.} *OLD disabled
*subtitle.scrub {regex (debug) ||prog box">(?:.*)<span >(.*?)<span >Episode||}
*subtitle.modify {remove (type=regex)|(.<.*span.*)}
subtitle.scrub {single |prog box">|<span >| </span></span>|<h3>}
subtitle.modify {cleanup (tags="<"">")}
subtitle.modify {remove (type=regex)|(.Episode.*)}
subtitle.modify {remove (type=regex)|(Rating.*)}
productiondate.modify {addstart |'index_temp_3'}
*rating.scrub {single |Rating:</span> ||</span>|</div></a></div></li><li} *disabled since using imdb star-ratings
description.scrub {single |prog box"|<h3>||</h3>}
description.modify {remove |</h3>}
category.scrub {multi |Genre</h2>|<div >|</div>|<div class="}
category.scrub {single |prog box">|><h2 class='|'>|<span}
*********general credits*********
*actor.scrub {regex ||<ul class="cast">(?:.*?)(?:<li><div>(.*?)</div></li>(?:.*?))*</ul>||}
*actor.modify {replace |</div><div class="partname">|[}
*actor.modify {addend (notnull)|]}
*actor.modify {remove | [Actor]}
***************
*specific credits
actor.scrub {multi |id="cast-box" >|<div>| </div><div class|castoverlay" >}
temp_1.scrub {multi |Crew</h2>|<li><div>|</div></li>|</ul>} *debug for more credits
temp_1.modify {replace |</div><div class="partname">|[}
temp_1.modify {addend (notnull)|]}
director.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Director\])"} *spa, eng
director.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Réalisateur\])"} *french
composer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Compositor\])"} *spa, port
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Produtora\])"} *port
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Produtor\])"} *port
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Productor\])"} *spa
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Productor ejecutivo\])"} *spa
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Compañías productores\])"} *spa
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Executive Producer\])"} *eng
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Production Company\])"} *eng
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Société de production\])"} *french
commentator.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Reporter\])"} *eng
commentator.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Jornalista\])"} *port
commentator.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Journaliste\])"} *french
writer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Writer\])"} *eng
writer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Autor\])"} *port
writer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Escritor\])"} *spa
presenter.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Apresentador\])"} *port
showicon.scrub {single |og:image"|content="https:|"| />}
showicon.modify {addstart (notnull)|https:}
end_scope
** _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
** ##### COUNTRY FILE CREATION (only to create the xxx_country.xml file)
**
** @auto_xml_country_start
*url_index{url|http://www.yo.tv}
*index_site_channel.scrub {multi|<ul data-filter="true" data-role="listview">|<li>|</li>|</ul>}
*index_site_id.scrub {multi|<ul data-filter="true" data-role="listview">|href="|"|</ul>}
*scope.range {(channellist)|end}
*index_site_id.modify {substring(type=regex)|http://(.*).yo.tv}
*index_site_id.modify {addstart|COUNTRY:}
*index_site_id.modify {addend|,ZIPCODE:}
*index_site_channel.modify {cleanup(tags="<"">")}
*index_site_id.modify {cleanup(removeduplicates=equal,100 link="index_site_channel")}
*end_scope
** @auto_xml_country_end
** _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
** ##### PROVIDER FILE CREATION (only to create the xxx_provider.xml file)
**
** @auto_xml_provider_start
*scope.range {(urlindex)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*url_index.modify {replace|##COUNTRY##|'index_temp_1'}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "ZIPCODE:([^,]*)"}
*url_index.modify {replace|##ZIPCODE##|'index_temp_1'}
*end_scope
*
*url_index{url |http://##COUNTRY##.yo.tv/api/setting?id=##ZIPCODE##&lookupid=1}
*index_site_channel.scrub {regex||"Name"\s*:\s*"([^"\\]*(?:\\.[^"\\]*)*)"||}
*index_site_id.scrub {regex||"Value"\s*:\s*"([^"\\]*(?:\\.[^"\\]*)*)"||}
*scope.range {(channellist)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*index_site_id.modify {addstart|COUNTRY:'index_temp_1',PROVIDER_ID:}
*index_site_id.modify {cleanup(removeduplicates=equal,100 link="index_site_channel")}
*end_scope
** @auto_xml_provider_end
** _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
** ##### CHANNELPROVIDER FILE CREATION (only to create the xxx-channel.xml file)
**
** @auto_xml_channelprovider_start
*scope.range {(urlindex)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*url_index.modify {replace|##COUNTRY##|'index_temp_1'}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "PROVIDER_ID:([^,]*)"}
*url_index.modify {replace |##PROVIDER_ID##|'index_temp_1'}
*end_scope
*
*url_index{url|http://##COUNTRY##.yo.tv/settings/headend/##PROVIDER_ID##}
*index_site_channel.scrub {multi(separator=" " include="alt=""<h2>")|<div id="channelbar" >|<li>|</li>|</ul>}
*index_site_channel.modify {remove |alt=}
*index_site_channel.modify {remove |"}
*index_site_channel.modify {remove |<h2>}
*index_site_channel.modify {remove |</h2>}
*index_site_id.scrub{multi|<div id="content" >|<li id="|"}
*scope.range {(channellist)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*index_temp_2.modify {substring(type=regex)|'config_site_id' "PROVIDER_ID:([^,]*)"}
*index_site_id.modify {addstart|COUNTRY:'index_temp_1',PROVIDER_ID:'index_temp_2',CHANNEL_ID:}
*index_site_id.modify {cleanup(removeduplicates=equal,100 link="index_site_channel")}
*end_scope
** @auto_xml_channelprovider_end
** _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
** ##### CHANNEL FILE CREATION (only to create the xxx-channel.xml file)
**
** @auto_xml_channel_start
*scope.range {(urlindex)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*url_index.modify {replace|##COUNTRY##|'index_temp_1'}
*end_scope
*
*url_index{url|http://##COUNTRY##.yo.tv/settings/headend/}
*index_site_channel.scrub {multi(separator=" " include="alt=""<h2>")|<div id="channelbar" >|<li>|</li>|</ul>}
*index_site_channel.modify {remove|alt=}
*index_site_channel.modify {remove|"}
*index_site_channel.modify {remove|<h2>}
*index_site_channel.modify {remove|</h2>}
*index_site_id.scrub{multi|<div id="content" >|<li id="|"}
*scope.range {(channellist)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*index_site_id.modify {addstart|COUNTRY:'index_temp_1',CHANNEL_ID:}
*index_site_id.modify {cleanup(removeduplicates=equal,100 link="index_site_channel")}
*end_scope
** @auto_xml_channel_end
Great!
I can't feel the more-than-twice-as-fast scrub time, though.
Tested with many countries, and it looks like this one fixes some of the 'differ from... program' corrections it made when in incremental mode.
change:
index_title.scrub {single|<h2|</i> | </h2>| <h3>} *Changed..colon separator removed
to:
index_title.scrub {single|<h2|>| </h2>| <h3>} *Changed..colon separator removed
to fix missing film shows
Do you have some sample where this title goes missing? Haven't seen anything like that before yet. Only spent 1 week studying it though.
Hmm... the yo.tv site is a mess and VERY hard to code a siteini for. For example, the big problem is that season/episode numbering goes missing far too often (yo.tv must have messed up their own scrub code). That in turn creates a problem on the showdetails page: if season/episode numbering is missing, they remove the subtitle/episode name, so when using subtitle.scrub a bunch of shows (which do have a subtitle/episode name on yo.tv) never get the sub-title element created. Without the sub-title element you never get the chance to fetch the missing season/episode number via mdb postprocessing. So the only option, as I see it, is to scrub the index page for subtitles, which leads to the next problem. On the index page they use the same h3 tag for season/episode number, episode name and description, which is amazingly lazy. So if scrubbing the subtitle from the index page, you need to have Jan's skills to pull this off in the first place. I'm not familiar with the siteini syntax (nor am I a regex fan), so I have no idea how to fix it. The problem I can't solve via siteini coding is that when the description is in the h3 tag, the description shows up in the sub-title element, and I don't know how to remove those cases. The complexity doesn't stop there... WG++ seems to apply a maximum character limit to the sub-title, and when the limit is reached an ellipsis gets added. So matching the description against the subtitle, in order to prevent the sub-title element from being created, requires escaping the ellipsis, the last period (when the string is within the max char limit, a period gets added which is not present in the description) and the backslash (WG++ escapes two double quotes "" to a backslash plus a single double quote \"), then matching and removing. Using AutoHotkey with the MSXML DOM such a thing is quite easy, so I had to solve that by postprocessing the xmltv file via my own script before running the mdb postprocessor. Maybe one of the WG++ gods could have a look at this siteini?
I'm out, because solving this requires too much of my time.
This is what Im using now - escaping, match and remove is not done here:
**------------------------------------------------------------------------------------------------
* @header_start
* WebGrab+Plus ini for grabbing EPG data from TvGuide websites
* @Site: yo.tv
* @MinSWversion:
* @Revision 4 - [22/04/2016] 1NSdbZVbpZDX
* - fix title to match generic siteini, fix credits details
* added subtitles and production date
* @Revision 3 - [31/01/2016] 1NSdbZVbpZDX
* - fix subtitle, episode and gabs
* @Revision 2 - [28/01/2016] Francis De Paemeleere
* - create a country only channels.xml generation
* @Revision 1 - [27/01/2016] 1NSdbZVbpZDX
* - make a generic siteini
* @Revision 0 - [08/09/2014] Willy De Wilde/Francis De Paemeleere
* - creation
* @Remarks:
* @header_end
**------------------------------------------------------------------------------------------------
* General site settings (time zone, max days, culture, charset, title match factor) and on-screen episode numbering.
site {url=yo.tv|timezone=UTC|maxdays=14.14|cultureinfo=en-GB|charset=UTF-8|titlematchfactor=50|nopageoverlaps}
site {episodesystem=onscreen}
urldate.format {daycounter|0}
* Index-page URL template: ##COUNTRY## and ##CHANNEL_ID## are placeholders filled in below; the day value comes from urldate.
url_index{url |http://##COUNTRY##.yo.tv/api/GS?cid=##CHANNEL_ID##,&offset=,&day=|urldate|}
* urlindex scope: read the COUNTRY, PROVIDER_ID and CHANNEL_ID values out of 'config_site_id'
* (each value runs up to the next comma) and substitute them into url_index.
scope.range {(urlindex)|end}
index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
url_index.modify {replace|##COUNTRY##|'index_temp_1'}
* NOTE(review): the URL template above contains no ##PROVIDER_ID## placeholder, so this replace looks like a no-op here - confirm.
index_temp_1.modify {substring(type=regex)|'config_site_id' "PROVIDER_ID:([^,]*)"}
url_index.modify {replace|##PROVIDER_ID##|'index_temp_1'}
index_temp_1.modify {substring(type=regex)|'config_site_id' "CHANNEL_ID:([^,]*)"}
url_index.modify {replace |##CHANNEL_ID##|'index_temp_1'}
end_scope
* Send an Accept-Encoding header so the index page may be returned compressed.
url_index.headers {customheader=Accept-Encoding=gzip,deflate}
* NOTE(review): repeats the urldate.format line above - presumably redundant; confirm before removing.
urldate.format {daycounter|0}
* Index-page scrubs: one element per show, then start time, title, sub-title and production year.
*index_showsplit.scrub {regex (debug)||<a style='width:(.*?)</a>||}
index_showsplit.scrub {multi |["|<a|</a>|"]}
*index_start.scrub {regex ||data-time='(.*?)'||}
* Start time is the quoted value of the data-time attribute.
index_start.scrub {single |data-time=|'|'|data-flags}
*index_title.scrub {single|<h2|</i> | </h2>| <h3>} *OLD Changed..colon separator removed
* Title: the <h2> text is kept whole (no ": " split); the block now starts at ">" rather than "</i> ".
index_title.scrub {single|<h2|>| </h2>| <h3>} *Changed..colon separator removed to fix missing film shows
*index_title.scrub {single (separator=": " include=first)|<h2|</i> | </h2>| <h3>} *OLD
*index_title.scrub {regex (separator=": " include=first)||>(.*?)\s<\/h2>||} *OLD
index_title.modify {cleanup (tags="<"">")}
* Sub-title: read from the <h3> element of the index entry.
* NOTE(review): per the thread, the same <h3> can also hold episode numbering or the description, so this value is not always a real sub-title.
index_subtitle.scrub {single|<h3>||<|/h3>} * Changed
*index_subtitle.scrub {single (separator=": " exclude=first)|<h2|> | </h2>| <h3>} *OLD
index_subtitle.modify {cleanup (tags="<"">")} *NEW
* index_temp_3 captures a four-digit year in parentheses just before the closing </h2>.
index_temp_3.scrub {regex ||\((\d{4})\).<\/h2>\s<h3>||} *for productiondate
* Strip a trailing "(YYYY)" and the character before it from the title.
index_title.modify {remove(type=regex)|(.\(\d{4}\)$)}
*index_subtitle.modify {remove(type=regex)|(.\(\d{4}\)$)} * OLD not working
*index_subtitle.modify {remove(type=regex)|[^\.]+\. } *NEW removes digit-digit-space-indexsubtitle-period-space and leaves showdetails subtitle
*index_subtitle.modify {remove(type=regex)|(^[\d-]*\s*)} *NEW
* Strip a leading "digits-digits " prefix from the sub-title.
index_subtitle.modify {remove(type=regex)|^\d+-\d+ } *NEW removes digit-digit-space only
* indexshowdetails scope: build the detail-page URL for each show.
scope.range {(indexshowdetails)|end}
* index_temp_1 holds the text between href=' and '> in the show element.
index_temp_1.scrub {single|href='||'>|}
index_temp_2.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"} *for urlshow
* Detail URL = http://<COUNTRY>.yo.tv followed by the scrubbed href value.
index_urlshow.modify {set |http://'index_temp_2'.yo.tv'index_temp_1'}
index_urlshow.headers {customheader=Accept-Encoding=gzip,deflate} * to speedup the downloading of the detail pages
end_scope
* showdetails scope: scrubs applied to the detail page of each show.
* In this version every detail-page subtitle scrub is disabled; only index_subtitle is scrubbed.
scope.range {(showdetails)|end}
* Title: kept whole (no ": " split), then a trailing "(YYYY)" and the character before it are removed.
title.scrub {single|bg-title" >||</div>|} *Changed..colon separator removed.
*title.scrub {single (separator=": " include=first)|bg-title" >||</div>|}
title.modify {remove (type=regex)|(.\(\d{4}\)$)}
*subtitle.scrub {single (separator=": " exclude=first)|<h2|>||</h2>} *disabled
*subtitle.modify {remove (type=regex)|(\s\(\d{4}\))} *disabled
*subtitle.modify {remove | </h2} *disabled
* Episode: take the "episode" span, drop the "of N" part and any trailing span markup,
* then shorten "Season " to "s" and " Episode " to "e".
episode.scrub {single |prog box">|<span class="episode" >|</span><h3>|</div></div>}
* NOTE(review): "(:?" is an optional literal colon, not a non-capturing group "(?:" - confirm which was intended.
episode.modify {remove (type=regex)|(:?.of.\d+)}
episode.modify {remove (type=regex)|(.<.*span.*)}
episode.modify {replace |Season |s} *Changed
episode.modify {replace | Episode |e} *Changed
*episode.modify {replace |Season |S.} *OLD disabled
*episode.modify {replace | Episode |-Ep.} *OLD disabled
*subtitle.scrub {regex (debug) ||prog box">(?:.*)<span >(.*?)<span >Episode||}
*subtitle.modify {remove (type=regex)|(.<.*span.*)}
*subtitle.scrub {single |prog box">|<span >| </span></span>|<h3>}
*subtitle.modify {cleanup (tags="<"">")}
*subtitle.modify {remove (type=regex)|(.Episode.*)}
*subtitle.modify {remove (type=regex)|(Rating.*)}
* Production date: the year captured from the index page into index_temp_3.
productiondate.modify {addstart |'index_temp_3'}
*rating.scrub {single |Rating:</span> ||</span>|</div></a></div></li><li} *disabled since using imdb star-ratings
* Description: text of the <h3> following the "prog box" marker, with any leftover </h3> removed.
description.scrub {single |prog box"|<h3>||</h3>}
description.modify {remove |</h3>}
* NOTE(review): category is scrubbed twice from different parts of the page; whether the second
* scrub adds to or replaces the first is not visible here - confirm.
category.scrub {multi |Genre</h2>|<div >|</div>|<div class="}
category.scrub {single |prog box">|><h2 class='|'>|<span}
*********general credits*********
*actor.scrub {regex ||<ul class="cast">(?:.*?)(?:<li><div>(.*?)</div></li>(?:.*?))*</ul>||}
*actor.modify {replace |</div><div class="partname">|[}
*actor.modify {addend (notnull)|]}
*actor.modify {remove | [Actor]}
***************
*specific credits
actor.scrub {multi |id="cast-box" >|<div>| </div><div class|castoverlay" >}
* Crew entries go into temp_1 with the role label wrapped in square brackets after the name.
* Each credit element below then extracts the text preceding a matching "[Role]" label, one line per language variant.
temp_1.scrub {multi |Crew</h2>|<li><div>|</div></li>|</ul>} *debug for more credits
temp_1.modify {replace |</div><div class="partname">|[}
temp_1.modify {addend (notnull)|]}
director.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Director\])"} *spa, eng
director.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Réalisateur\])"} *french
composer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Compositor\])"} *spa, port
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Produtora\])"} *port
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Produtor\])"} *port
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Productor\])"} *spa
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Productor ejecutivo\])"} *spa
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Compañías productores\])"} *spa
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Executive Producer\])"} *eng
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Production Company\])"} *eng
producer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Société de production\])"} *french
commentator.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Reporter\])"} *eng
commentator.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Jornalista\])"} *port
commentator.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Journaliste\])"} *french
writer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Writer\])"} *eng
writer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Autor\])"} *port
writer.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Escritor\])"} *spa
presenter.modify {substring (type=regex)|'temp_1' "(.*?)\s(?:\[Apresentador\])"} *port
* Show icon scrub is disabled in this version.
*showicon.scrub {single |og:image"|content="https:|"| />}
*showicon.modify {addstart (notnull)|https:}
end_scope
** _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
** ##### COUNTRY FILE CREATION (only to create the xxx_country.xml file)
**
** @auto_xml_country_start
*url_index{url|http://www.yo.tv}
*index_site_channel.scrub {multi|<ul data-filter="true" data-role="listview">|<li>|</li>|</ul>}
*index_site_id.scrub {multi|<ul data-filter="true" data-role="listview">|href="|"|</ul>}
*scope.range {(channellist)|end}
*index_site_id.modify {substring(type=regex)|http://(.*).yo.tv}
*index_site_id.modify {addstart|COUNTRY:}
*index_site_id.modify {addend|,ZIPCODE:}
*index_site_channel.modify {cleanup(tags="<"">")}
*index_site_id.modify {cleanup(removeduplicates=equal,100 link="index_site_channel")}
*end_scope
** @auto_xml_country_end
** _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
** ##### PROVIDER FILE CREATION (only to create the xxx_provider.xml file)
**
** @auto_xml_provider_start
*scope.range {(urlindex)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*url_index.modify {replace|##COUNTRY##|'index_temp_1'}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "ZIPCODE:([^,]*)"}
*url_index.modify {replace|##ZIPCODE##|'index_temp_1'}
*end_scope
*
*url_index{url |http://##COUNTRY##.yo.tv/api/setting?id=##ZIPCODE##&lookupid=1}
*index_site_channel.scrub {regex||"Name"\s*:\s*"([^"\\]*(?:\\.[^"\\]*)*)"||}
*index_site_id.scrub {regex||"Value"\s*:\s*"([^"\\]*(?:\\.[^"\\]*)*)"||}
*scope.range {(channellist)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*index_site_id.modify {addstart|COUNTRY:'index_temp_1',PROVIDER_ID:}
*index_site_id.modify {cleanup(removeduplicates=equal,100 link="index_site_channel")}
*end_scope
** @auto_xml_provider_end
** _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
** ##### CHANNELPROVIDER FILE CREATION (only to create the xxx-channel.xml file)
**
** @auto_xml_channelprovider_start
*scope.range {(urlindex)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*url_index.modify {replace|##COUNTRY##|'index_temp_1'}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "PROVIDER_ID:([^,]*)"}
*url_index.modify {replace |##PROVIDER_ID##|'index_temp_1'}
*end_scope
*
*url_index{url|http://##COUNTRY##.yo.tv/settings/headend/##PROVIDER_ID##}
*index_site_channel.scrub {multi(separator=" " include="alt=""<h2>")|<div id="channelbar" >|<li>|</li>|</ul>}
*index_site_channel.modify {remove |alt=}
*index_site_channel.modify {remove |"}
*index_site_channel.modify {remove |<h2>}
*index_site_channel.modify {remove |</h2>}
*index_site_id.scrub{multi|<div id="content" >|<li id="|"}
*scope.range {(channellist)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*index_temp_2.modify {substring(type=regex)|'config_site_id' "PROVIDER_ID:([^,]*)"}
*index_site_id.modify {addstart|COUNTRY:'index_temp_1',PROVIDER_ID:'index_temp_2',CHANNEL_ID:}
*index_site_id.modify {cleanup(removeduplicates=equal,100 link="index_site_channel")}
*end_scope
** @auto_xml_channelprovider_end
** _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
** ##### CHANNEL FILE CREATION (only to create the xxx-channel.xml file)
**
** @auto_xml_channel_start
*scope.range {(urlindex)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*url_index.modify {replace|##COUNTRY##|'index_temp_1'}
*end_scope
*
*url_index{url|http://##COUNTRY##.yo.tv/settings/headend/}
*index_site_channel.scrub {multi(separator=" " include="alt=""<h2>")|<div id="channelbar" >|<li>|</li>|</ul>}
*index_site_channel.modify {remove|alt=}
*index_site_channel.modify {remove|"}
*index_site_channel.modify {remove|<h2>}
*index_site_channel.modify {remove|</h2>}
*index_site_id.scrub{multi|<div id="content" >|<li id="|"}
*scope.range {(channellist)|end}
*index_temp_1.modify {substring(type=regex)|'config_site_id' "COUNTRY:([^,]*)"}
*index_site_id.modify {addstart|COUNTRY:'index_temp_1',CHANNEL_ID:}
*index_site_id.modify {cleanup(removeduplicates=equal,100 link="index_site_channel")}
*end_scope
** @auto_xml_channel_end
<channel site="yo.tv" site_id="COUNTRY:brasil,PROVIDER_ID:,CHANNEL_ID:12625" update="i" xmltv_id="TNT HD [Brazil]">TNT HD [Brazil]</channel>
I don't use mdb, and season/episode works OK for me.