11'use strict' ;
22
33const Utils = require ( './utils' ) ;
4+ const Unraw = require ( 'unraw' ) . default ;
5+ const Cheerio = require ( 'cheerio' ) ;
46const Constants = require ( './constants' ) ;
57const NormalizeText = require ( 'replace-special-characters' ) ;
6-
8+
79class Parser {
8- constructor ( $ , raw_data ) {
9- this . $ = $ ;
10- this . raw_data = raw_data ;
10+ constructor ( data ) {
11+ this . data = data ;
12+ this . $ = Cheerio . load ( Utils . refineData ( data ) ) ;
1113 }
1214
1315 getOrganicResults ( ) {
@@ -23,7 +25,8 @@ class Parser {
2325 return this . $ ( el ) . text ( ) . trim ( ) ;
2426 } ) . get ( ) ;
2527
26- const urls = this . $ ( Constants . SELECTORS . URL ) . map ( ( i , el ) => this . $ ( el ) . attr ( 'href' ) ) . get ( ) ;
28+ const urls = this . $ ( Constants . SELECTORS . URL )
29+ . map ( ( i , el ) => this . $ ( el ) . attr ( 'href' ) ) . get ( ) ;
2730
2831 this . #correctFuzzyData( titles , descriptions , urls ) ;
2932
@@ -49,7 +52,7 @@ class Parser {
4952 this . $ ( Constants . SELECTORS . KNO_PANEL_METADATA ) . each ( ( i , el ) => {
5053 const key = this . $ ( el ) . first ( ) . text ( ) . trim ( ) . slice ( 0 , - 1 ) ;
5154 const value = this . $ ( el ) . next ( ) . text ( ) . trim ( ) ;
52- value . length && ( knowledge_panel [ NormalizeText ( key . toLowerCase ( ) . replace ( / / g, '_' ) ) ] = value . trim ( ) ) ;
55+ value . length && ( knowledge_panel [ NormalizeText ( key . toLowerCase ( ) . replace ( / / g, '_' ) . replace ( / \( | \) / g , '' ) ) ] = value . trim ( ) ) ;
5356 } ) ;
5457
5558 const knowledge_panel_type = this . $ ( Constants . SELECTORS . KNO_PANEL_TYPE ) . last ( ) . text ( ) ;
@@ -79,7 +82,7 @@ class Parser {
7982 . replace ( / < \/ s p a n > < \/ d i v > < d i v j s n a m e = " u 8 s 5 s f " c l a s s = " u j u d u b " > < s p a n j s n a m e = " y s 0 1 g e " > / g, '\n\n' )
8083 . replace ( / < b r > / g, '\n' ) ) . text ( ) ) . get ( ) ;
8184
82- song_lyrics . length > 0 && ( knowledge_panel . lyrics = song_lyrics . join ( '\n\n' ) ) ;
85+ song_lyrics . length && ( knowledge_panel . lyrics = song_lyrics . join ( '\n\n' ) ) ;
8386
8487 const google_users_rating = this . $ ( Constants . SELECTORS . KNO_PANEL_FILM_GOOGLEUSERS_RATING ) [ 0 ] ;
8588 if ( google_users_rating ) {
@@ -106,20 +109,20 @@ class Parser {
106109 knowledge_panel . images = this . $ ( Constants . SELECTORS . KNO_PANEL_IMAGES ) . map ( ( i , elem ) => {
107110 return {
108111 url : this . $ ( elem ) . attr ( 'data-src' ) ,
109- source : this . $ ( elem ) . parent ( ) . parent ( ) . parent ( ) . attr ( 'data-lpage' ) ,
112+ source : this . $ ( elem ) . parent ( ) . parent ( ) . parent ( ) . parent ( ) . attr ( 'data-lpage' ) ,
110113 } ;
111- } ) . get ( ) . filter ( ( img ) => img . url !== undefined ) ;
114+ } ) . get ( ) . filter ( ( img ) => img . url ) ;
112115
113- const demo = Utils . getStringBetweenStrings ( this . raw_data , 'source src\\x3d\\x22' , '.mp4' ) ;
116+ const demo = Utils . getStringBetweenStrings ( this . data , 'source src\\x3d\\x22' , '.mp4' ) ;
114117 demo && ( knowledge_panel . demonstration = demo + '.mp4' ) ;
115118
116- knowledge_panel . books . length == 0 &&
119+ ! knowledge_panel . books . length &&
117120 delete knowledge_panel . books ;
118- knowledge_panel . tv_shows_and_movies . length == 0 &&
121+ ! knowledge_panel . tv_shows_and_movies . length &&
119122 delete knowledge_panel . tv_shows_and_movies ;
120- knowledge_panel . available_on . length == 0 &&
123+ ! knowledge_panel . available_on . length &&
121124 delete knowledge_panel . available_on ;
122- knowledge_panel . images . length == 0 &&
125+ ! knowledge_panel . images . length &&
123126 delete knowledge_panel . images ;
124127
125128 return knowledge_panel ;
@@ -149,7 +152,7 @@ class Parser {
149152 } else {
150153 return undefined ;
151154 }
152- } ) . filter ( text => text != undefined && text . length != 0 ) [ 0 ] ;
155+ } ) . filter ( text => text && text . length ) [ 0 ] ;
153156
154157 return {
155158 title : featured_snippet_title || 'N/A' ,
@@ -158,42 +161,28 @@ class Parser {
158161 } ;
159162 }
160163
164+ getDidYouMean ( ) {
165+ return this . $ ( Constants . SELECTORS . DID_YOU_MEAN ) . text ( ) ;
166+ }
167+
161168 getTopStories ( ) {
162169 // Removes unnecessary text from the description
163- this . $ ( `${ Constants . SELECTORS . TOP_STORIES_DESCRIPTION [ 0 ] } > div.CEMjEf` ) . each ( ( i , el ) => this . $ ( el ) . remove ( ) ) ;
164- this . $ ( `${ Constants . SELECTORS . TOP_STORIES_DESCRIPTION [ 0 ] } > div > p` ) . each ( ( i , el ) => this . $ ( el ) . remove ( ) ) ;
170+ this . $ ( `${ Constants . SELECTORS . TOP_STORIES_DESCRIPTION [ 0 ] } > div.CEMjEf` ) . each ( ( el ) => this . $ ( el ) . remove ( ) ) ;
171+ this . $ ( `${ Constants . SELECTORS . TOP_STORIES_DESCRIPTION [ 0 ] } > div > p` ) . each ( ( el ) => this . $ ( el ) . remove ( ) ) ;
165172
166173 const top_stories_descriptions = Constants . SELECTORS . TOP_STORIES_DESCRIPTION . map ( ( selector ) =>
167- this . $ ( selector ) . map ( ( i , el ) => this . $ ( el ) . text ( ) . slice ( 1 ) ) . get ( ) ) . filter ( ( descs ) => descs . length > 0 ) [ 0 ] ;
168- const top_stories_urls = this . $ ( Constants . SELECTORS . TOP_STORIES_URL ) . map ( ( i , el ) => this . $ ( el ) . attr ( 'href' ) ) . get ( ) ;
174+ this . $ ( selector ) . map ( ( el ) => this . $ ( el ) . text ( ) ) . get ( ) ) . filter ( ( descs ) => descs . length > 0 ) [ 0 ] ;
175+ const top_stories_urls = this . $ ( Constants . SELECTORS . TOP_STORIES_URL ) . map ( ( el ) => this . $ ( el ) . attr ( 'href' ) ) . get ( ) ;
169176
170177 return top_stories_urls . map ( ( item , i ) => {
171178 if ( ! top_stories_descriptions ) return ;
172179 return {
173180 description : top_stories_descriptions [ i ] ,
174- url : item ,
181+ url : item
175182 } ;
176183 } ) . filter ( ( story ) => story ) ;
177184 }
178185
179- getPaa ( ) {
180- let people_also_ask = [ ] ;
181- Constants . SELECTORS . PAA . forEach ( ( item ) =>
182- this . $ ( item ) . each ( ( i , el ) => people_also_ask . push ( this . $ ( el ) . text ( ) ) ) ) ;
183- people_also_ask . shift ( ) ;
184- return people_also_ask ;
185- }
186-
187- getPas ( ) {
188- return this . $ ( Constants . SELECTORS . PASF ) . map ( ( i , el ) => {
189- if ( ! this . $ ( el ) . attr ( 'data-src' ) ) return ;
190- return {
191- title : this . $ ( el ) . attr ( 'alt' ) ,
192- thumbnail : `https:${ this . $ ( el ) . attr ( 'data-src' ) } `
193- } ;
194- } ) . get ( ) ;
195- }
196-
197186 getTime ( ) {
198187 const hours = this . $ ( Constants . SELECTORS . CURRENT_TIME_HOUR ) . text ( ) ;
199188 const date = this . $ ( Constants . SELECTORS . CURRENT_TIME_DATE ) . map ( ( i , el ) => this . $ ( el ) . text ( ) ) . get ( ) [ 1 ] ;
@@ -303,15 +292,44 @@ class Parser {
303292 }
304293 }
305294
295+ getPaa ( ) {
296+ const people_also_ask = [ ] ;
297+
298+ Constants . SELECTORS . PAA . forEach ( ( item ) =>
299+ this . $ ( item ) . each ( ( i , el ) => people_also_ask . push ( this . $ ( el ) . text ( ) ) ) ) ;
300+
301+ people_also_ask . shift ( ) ;
302+
303+ const extra_data = JSON . parse ( Unraw ( Utils . getStringBetweenStrings ( this . data , 'var c=\'' , '\';google' ) || '{}' ) ) ;
304+ const rfs = extra_data ?. sb_wiz ?. rfs ;
305+
306+ rfs && rfs . forEach ( ( el ) => {
307+ const item = el . replace ( / < b > | < \/ b > / g, '' ) ;
308+ people_also_ask . push ( item ) ;
309+ } ) ;
310+
311+ return people_also_ask ;
312+ }
313+
314+ getPas ( ) {
315+ return this . $ ( Constants . SELECTORS . PASF ) . map ( ( i , el ) => {
316+ if ( ! this . $ ( el ) . attr ( 'data-src' ) ) return ;
317+ return {
318+ title : this . $ ( el ) . attr ( 'alt' ) ,
319+ thumbnail : `https:${ this . $ ( el ) . attr ( 'data-src' ) } `
320+ } ;
321+ } ) . get ( ) ;
322+ }
323+
306324 #correctFuzzyData( titles , descriptions , urls ) {
307325 titles . length < urls . length && titles . length < descriptions . length && urls . shift ( ) ;
308326 urls . length > titles . length && urls . shift ( ) ;
309327
310- const innacurate_data = descriptions . length > urls . slice ( 1 ) . length ? false : true ;
311-
328+ const is_innacurate_data = descriptions . length < urls . slice ( 1 ) . length ;
329+
312330 urls . forEach ( ( item , index ) => {
313331 // Why YouTube? Because video results usually don't have a description.
314- if ( item . includes ( 'm.youtube.com' ) && innacurate_data && Constants . URLS . length > 1 ) {
332+ if ( item . includes ( 'm.youtube.com' ) && is_innacurate_data ) {
315333 urls . splice ( index , 1 ) ;
316334 titles . splice ( index , 1 ) ;
317335 index -- ;
0 commit comments