Skip to content

Commit bc96493

Browse files
committed
fix: parsing problems and other major bugs
1 parent 809d49e commit bc96493

File tree

9 files changed

+210
-162
lines changed

9 files changed

+210
-162
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ async function start() {
3434
}
3535
}
3636

37-
const response = await google.search("TWDG", options);
37+
const response = await google.search('TWDG', options);
3838
console.log(response);
3939
}
4040

@@ -215,7 +215,7 @@ start();
215215

216216
## What else can it do?
217217

218-
As you may have noticed, the library returns a lot of data. Currently it can parse everything from the knowledge graph, featured snippets and much more such as Google Dictionary, Google Translator and song lyrics.
218+
As you can see, the library returns a lot of data. Currently it can parse the knowledge graph, featured snippets, and much more, such as Google Dictionary, Google Translate and song lyrics.
219219
All you have to do is search for something along the lines of ```“define xyz”```, ```“translate x to y”``` or ```“xyz song lyrics”``` and the appropriate fields will appear in the response.
220220

221221
#### Examples:

examples/index.js

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@ async function start() {
1818
// Reverse Image Search
1919
const reverse = await google.search("https://i.pinimg.com/236x/92/16/d9/9216d9a222ef65eb6eabfff1970180d1.jpg", { ris: true });
2020
console.info('Reverse Image Search:', reverse.results);
21-
22-
// Top news
21+
2322
const news = await google.getTopNews();
2423
console.info('Google Top News:', news);
2524
}

lib/constants.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ module.exports = {
2929
KNO_PANEL_TYPE: 'div.BkwXh > div',
3030
KNO_PANEL_SONG_LYRICS: 'div.ujudUb',
3131
KNO_PANEL_AVAILABLE_ON: 'div[class="ellip bclEt"]',
32-
KNO_PANEL_IMAGES: 'div > g-scrolling-carousel > div > div > div > g-inner-card > g-img > img',
32+
KNO_PANEL_IMAGES: 'div > g-inner-card.xIfh4d > div > img',
3333
KNO_PANEL_BOOKS: 'div[data-attrid="kc:/book/author:books only"] > a > div > div > div.Bo9xMe > div',
3434
KNO_PANEL_TV_SHOWS_AND_MOVIES: 'div[data-attrid="kc:/people/person:tv-shows-and-movies"] > a > div > div > div.Bo9xMe > div',
3535
KNO_PANEL_FILM_GOOGLEUSERS_RATING: 'div[data-attrid="kc:/ugc:thumbs_up"] > div > div > div',
@@ -68,7 +68,7 @@ module.exports = {
6868

6969
// Google Dictionary
7070
GD_WORD: 'span[data-dobid="hdw"]',
71-
GD_PHONETIC: 'div[class="S23sjd"]',
71+
GD_PHONETIC: 'div.qexShd',
7272
GD_AUDIO: 'audio > source',
7373
GD_DEFINITIONS: 'div[data-dobid="dfn"]',
7474
GD_EXAMPLES: 'div[class="ubHt5c"]',
@@ -87,7 +87,7 @@ module.exports = {
8787
TOP_STORIES_WEBSITE: 'div[class="g5wfEd"] > div > g-img > img',
8888

8989
// “People also ask”
90-
PAA: [ 'div.s75CSd.u60jwe.gduDCb > span', 'div.wWOJcd > div > span', 'div.SC9Vz > div.zd9Fwc' ],
90+
PAA: [ 'div.s75CSd.u60jwe.gduDCb > span', 'div.gbCQS.u60jwe.gduDCb > div > span', 'div.JlqpRe > span' ],
9191

9292
// “People also search for”
9393
PASF: 'div[class="IHdOHf"] > img',

lib/googlethis.js

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,14 @@ const Cheerio = require('cheerio');
77
const Constants = require('./constants');
88

99
/**
10-
* Searches a given query on Google.
10+
* Search a given query on Google.
1111
*
12-
* @param {string} query Query.
13-
* @param {object} options Search options.
12+
* @param {string} query - Search query
13+
* @param {object} [options] - Search options
14+
* @param {boolean} [options.ris] - Use reverse image search
15+
* @param {boolean} [options.safe] - Safe search
16+
* @param {number} [options.page] - Pagination
17+
* @param {object} [options.additional_params] - Parameters that will be passed to Google
1418
*/
1519
async function search(query, options = {}) {
1620
query = query.trim().split(/ +/).join('+');
@@ -27,10 +31,7 @@ async function search(query, options = {}) {
2731
'&start=' + options.page);
2832

2933
const response = await Axios.get(url, { params: options.additional_params, headers: Utils.getHeaders(true) }).catch((err) => err);
30-
if (response instanceof Error) throw new Error('Could not search on Google: ' + response.message);
31-
32-
const $ = Cheerio.load(Utils.refineData(response.data));
33-
const parser = new Parser($, response.data);
34+
if (response instanceof Error) throw new Utils.SearchError('Could not execute search', { status_code: response?.status || 0, message: response?.message });
3435

3536
const results = {
3637
results: [],
@@ -41,12 +42,14 @@ async function search(query, options = {}) {
4142
people_also_ask: [],
4243
people_also_search_for: []
4344
};
45+
46+
const parser = new Parser(response.data);
4447

4548
results.results = parser.getOrganicResults();
4649
results.knowledge_panel = parser.getKnowledgeGraph();
4750
results.featured_snippet = parser.getFeaturedSnippet();
4851

49-
const did_you_mean = $(Constants.SELECTORS.DID_YOU_MEAN).text();
52+
const did_you_mean = parser.getDidYouMean();
5053
did_you_mean && (results.did_you_mean = did_you_mean) || (delete results.did_you_mean);
5154

5255
const unit_converter = parser.getConverters();
@@ -80,10 +83,13 @@ async function search(query, options = {}) {
8083
}
8184

8285
/**
83-
* Searches images on Google.
86+
* Google image search.
8487
*
85-
* @param {string} query Search query.
86-
* @param {object} options Search options.
88+
* @param {string} query - Search query
89+
* @param {object} [options] - Search options
90+
* @param {boolean} [options.safe] - Safe search
91+
* @param {object} [options.additional_params] - Parameters that will be passed to Google
92+
* @param {Array.<string>} [options.exclude_domains] - Domains that should be blocked
8793
*/
8894
async function image(query, options = {}) {
8995
query = query.trim().split(/ +/).join('+');
@@ -97,7 +103,7 @@ async function image(query, options = {}) {
97103
' ' + options.exclude_domains.map((site) => '-site:' + site);
98104

99105
const response = await Axios.get(url, { params: options.additional_params, headers: Utils.getHeaders(false) }).catch((err) => err);
100-
if (response instanceof Error) throw new Error('Could not search on Google: ' + response.message);
106+
if (response instanceof Error) throw new Utils.SearchError('Could not execute search', { status_code: response?.status || 0, message: response?.message });
101107

102108
const results = [];
103109
const origin = parseImageOriginData(response.data);
@@ -123,7 +129,7 @@ async function image(query, options = {}) {
123129
/**
124130
* Gets image origin data
125131
*
126-
* @param {string} data Raw html.
132+
* @param {string} data - Raw html.
127133
*/
128134
function parseImageOriginData(data) {
129135
let results = [];
@@ -132,7 +138,7 @@ function parseImageOriginData(data) {
132138
while (parsed_results != null) {
133139
results.push({
134140
title: parsed_results[4],
135-
website: parsed_results[3],
141+
source: parsed_results[3],
136142
});
137143
parsed_results = Constants.REGEX.IMAGE_ORIGIN.exec(data);
138144
}

lib/parser.js

Lines changed: 59 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
'use strict';
22

33
const Utils = require('./utils');
4+
const Unraw = require('unraw').default;
5+
const Cheerio = require('cheerio');
46
const Constants = require('./constants');
57
const NormalizeText = require('replace-special-characters');
6-
8+
79
class Parser {
8-
constructor($, raw_data) {
9-
this.$ = $;
10-
this.raw_data = raw_data;
10+
constructor(data) {
11+
this.data = data;
12+
this.$ = Cheerio.load(Utils.refineData(data));
1113
}
1214

1315
getOrganicResults() {
@@ -23,7 +25,8 @@ class Parser {
2325
return this.$(el).text().trim();
2426
}).get();
2527

26-
const urls = this.$(Constants.SELECTORS.URL).map((i, el) => this.$(el).attr('href')).get();
28+
const urls = this.$(Constants.SELECTORS.URL)
29+
.map((i, el) => this.$(el).attr('href')).get();
2730

2831
this.#correctFuzzyData(titles, descriptions, urls);
2932

@@ -49,7 +52,7 @@ class Parser {
4952
this.$(Constants.SELECTORS.KNO_PANEL_METADATA).each((i, el) => {
5053
const key = this.$(el).first().text().trim().slice(0, -1);
5154
const value = this.$(el).next().text().trim();
52-
value.length && (knowledge_panel[NormalizeText(key.toLowerCase().replace(/ /g, '_'))] = value.trim());
55+
value.length && (knowledge_panel[NormalizeText(key.toLowerCase().replace(/ /g, '_').replace(/\(|\)/g, ''))] = value.trim());
5356
});
5457

5558
const knowledge_panel_type = this.$(Constants.SELECTORS.KNO_PANEL_TYPE).last().text();
@@ -79,7 +82,7 @@ class Parser {
7982
.replace(/<\/span><\/div><div jsname="u8s5sf" class="ujudub"><span jsname="ys01ge">/g, '\n\n')
8083
.replace(/<br>/g, '\n')).text()).get();
8184

82-
song_lyrics.length > 0 && (knowledge_panel.lyrics = song_lyrics.join('\n\n'));
85+
song_lyrics.length && (knowledge_panel.lyrics = song_lyrics.join('\n\n'));
8386

8487
const google_users_rating = this.$(Constants.SELECTORS.KNO_PANEL_FILM_GOOGLEUSERS_RATING)[0];
8588
if (google_users_rating) {
@@ -106,20 +109,20 @@ class Parser {
106109
knowledge_panel.images = this.$(Constants.SELECTORS.KNO_PANEL_IMAGES).map((i, elem) => {
107110
return {
108111
url: this.$(elem).attr('data-src'),
109-
source: this.$(elem).parent().parent().parent().attr('data-lpage'),
112+
source: this.$(elem).parent().parent().parent().parent().attr('data-lpage'),
110113
};
111-
}).get().filter((img) => img.url !== undefined);
114+
}).get().filter((img) => img.url);
112115

113-
const demo = Utils.getStringBetweenStrings(this.raw_data, 'source src\\x3d\\x22', '.mp4');
116+
const demo = Utils.getStringBetweenStrings(this.data, 'source src\\x3d\\x22', '.mp4');
114117
demo && (knowledge_panel.demonstration = demo + '.mp4');
115118

116-
knowledge_panel.books.length == 0 &&
119+
!knowledge_panel.books.length &&
117120
delete knowledge_panel.books;
118-
knowledge_panel.tv_shows_and_movies.length == 0 &&
121+
!knowledge_panel.tv_shows_and_movies.length &&
119122
delete knowledge_panel.tv_shows_and_movies;
120-
knowledge_panel.available_on.length == 0 &&
123+
!knowledge_panel.available_on.length &&
121124
delete knowledge_panel.available_on;
122-
knowledge_panel.images.length == 0 &&
125+
!knowledge_panel.images.length &&
123126
delete knowledge_panel.images;
124127

125128
return knowledge_panel;
@@ -149,7 +152,7 @@ class Parser {
149152
} else {
150153
return undefined;
151154
}
152-
}).filter(text => text != undefined && text.length != 0)[0];
155+
}).filter(text => text && text.length)[0];
153156

154157
return {
155158
title: featured_snippet_title || 'N/A',
@@ -158,42 +161,28 @@ class Parser {
158161
};
159162
}
160163

164+
getDidYouMean() {
165+
return this.$(Constants.SELECTORS.DID_YOU_MEAN).text();
166+
}
167+
161168
getTopStories() {
162169
// Removes unnecessary text from the description
163-
this.$(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div.CEMjEf`).each((i, el) => this.$(el).remove());
164-
this.$(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div > p`).each((i, el) => this.$(el).remove());
170+
this.$(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div.CEMjEf`).each((el) => this.$(el).remove());
171+
this.$(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div > p`).each((el) => this.$(el).remove());
165172

166173
const top_stories_descriptions = Constants.SELECTORS.TOP_STORIES_DESCRIPTION.map((selector) =>
167-
this.$(selector).map((i, el) => this.$(el).text().slice(1)).get()).filter((descs) => descs.length > 0)[0];
168-
const top_stories_urls = this.$(Constants.SELECTORS.TOP_STORIES_URL).map((i, el) => this.$(el).attr('href')).get();
174+
this.$(selector).map((el) => this.$(el).text()).get()).filter((descs) => descs.length > 0)[0];
175+
const top_stories_urls = this.$(Constants.SELECTORS.TOP_STORIES_URL).map((el) => this.$(el).attr('href')).get();
169176

170177
return top_stories_urls.map((item, i) => {
171178
if (!top_stories_descriptions) return;
172179
return {
173180
description: top_stories_descriptions[i],
174-
url: item,
181+
url: item
175182
};
176183
}).filter((story) => story);
177184
}
178185

179-
getPaa() {
180-
let people_also_ask = [];
181-
Constants.SELECTORS.PAA.forEach((item) =>
182-
this.$(item).each((i, el) => people_also_ask.push(this.$(el).text())));
183-
people_also_ask.shift();
184-
return people_also_ask;
185-
}
186-
187-
getPas() {
188-
return this.$(Constants.SELECTORS.PASF).map((i, el) => {
189-
if (!this.$(el).attr('data-src')) return;
190-
return {
191-
title: this.$(el).attr('alt'),
192-
thumbnail: `https:${this.$(el).attr('data-src')}`
193-
};
194-
}).get();
195-
}
196-
197186
getTime() {
198187
const hours = this.$(Constants.SELECTORS.CURRENT_TIME_HOUR).text();
199188
const date = this.$(Constants.SELECTORS.CURRENT_TIME_DATE).map((i, el) => this.$(el).text()).get()[1];
@@ -303,15 +292,44 @@ class Parser {
303292
}
304293
}
305294

295+
getPaa() {
296+
const people_also_ask = [];
297+
298+
Constants.SELECTORS.PAA.forEach((item) =>
299+
this.$(item).each((i, el) => people_also_ask.push(this.$(el).text())));
300+
301+
people_also_ask.shift();
302+
303+
const extra_data = JSON.parse(Unraw(Utils.getStringBetweenStrings(this.data, 'var c=\'', '\';google') || '{}'));
304+
const rfs = extra_data?.sb_wiz?.rfs;
305+
306+
rfs && rfs.forEach((el) => {
307+
const item = el.replace(/<b>|<\/b>/g, '');
308+
people_also_ask.push(item);
309+
});
310+
311+
return people_also_ask;
312+
}
313+
314+
getPas() {
315+
return this.$(Constants.SELECTORS.PASF).map((i, el) => {
316+
if (!this.$(el).attr('data-src')) return;
317+
return {
318+
title: this.$(el).attr('alt'),
319+
thumbnail: `https:${this.$(el).attr('data-src')}`
320+
};
321+
}).get();
322+
}
323+
306324
#correctFuzzyData(titles, descriptions, urls) {
307325
titles.length < urls.length && titles.length < descriptions.length && urls.shift();
308326
urls.length > titles.length && urls.shift();
309327

310-
const innacurate_data = descriptions.length > urls.slice(1).length ? false : true;
311-
328+
const is_innacurate_data = descriptions.length < urls.slice(1).length;
329+
312330
urls.forEach((item, index) => {
313331
// Why YouTube? Because video results usually don't have a description.
314-
if (item.includes('m.youtube.com') && innacurate_data && Constants.URLS.length > 1) {
332+
if (item.includes('m.youtube.com') && is_innacurate_data) {
315333
urls.splice(index, 1);
316334
titles.splice(index, 1);
317335
index--;

lib/utils.js

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,14 @@
22

33
const UserAgent = require('user-agents');
44

5+
function SearchError(message, info) {
6+
this.info = info;
7+
this.stack = Error(message).stack;
8+
}
9+
10+
SearchError.prototype = Object.create(Error.prototype);
11+
SearchError.prototype.constructor = SearchError;
12+
513
/**
614
* Returns headers with a random user agent.
715
*
@@ -22,7 +30,7 @@ function getHeaders (is_mobile) {
2230
/**
2331
* Refines the html.
2432
*
25-
* @param {string} data Raw html data.
33+
* @param {string} data - Raw html data.
2634
* @returns {string} Refined data.
2735
*/
2836
function refineData (data) {
@@ -54,9 +62,9 @@ function refineData (data) {
5462
/**
5563
* Gets a string between two delimiters.
5664
*
57-
* @param {string} data The data.
58-
* @param {string} start_string Start string.
59-
* @param {string} end_string End string.
65+
* @param {string} data - The data.
66+
* @param {string} start_string - Start string.
67+
* @param {string} end_string - End string.
6068
*/
6169
function getStringBetweenStrings (data, start_string, end_string) {
6270
const regex = new RegExp(`${escapeStringRegexp(start_string)}(.*?)${escapeStringRegexp(end_string)}`, "s");
@@ -68,4 +76,4 @@ function escapeStringRegexp (string) {
6876
return string.replace(/[|\\{}()[\]^$+*?.]/g, '\\$&').replace(/-/g, '\\x2d');
6977
}
7078

71-
module.exports = { getHeaders, getStringBetweenStrings, refineData };
79+
module.exports = { SearchError, getHeaders, getStringBetweenStrings, refineData };

0 commit comments

Comments
 (0)