@@ -4,16 +4,28 @@
- https://github.com/searx/searx/issues/2019#issuecomment-648227442
"""
# pylint: disable=too-many-branches, invalid-name
from typing import TYPE_CHECKING
import datetime
import re
import uuid
from urllib.parse import urlencode, urlparse, parse_qs

from lxml import html
import babel
import babel.languages

from searx import network
from searx.enginelib.traits import EngineTraits
from searx.locales import language_tag, region_tag
from searx.network import multi_requests, Request
from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex, match_language
if TYPE_CHECKING:
    import logging

    # The module-level ``logger`` is injected by the searx engine loader at
    # runtime; the annotation exists only for static type checkers.
    logger: logging.Logger

# Engine traits (languages/regions) are populated by the engine loader,
# see ``fetch_traits`` below.
traits: EngineTraits
about = {
@@ -25,56 +37,124 @@ about = {
" results " : ' HTML ' ,
}
send_accept_language_header = True
"""Bing tries to guess user's language and territory from the HTTP
Accept-Language.  Optional the user can select a search-language (can be
different to the UI language) and a region (market code)."""

# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'}  # cookie: ADLT=STRICT

# NOTE(review): legacy language-detection settings, kept for backward
# compatibility with the generic engine machinery -- TODO confirm they are
# still read anywhere before removing.
supported_languages_url = 'https://www.bing.com/account/general'
language_aliases = {}

# search-url
base_url = 'https://www.bing.com/search'
"""Bing (Web) search URL"""

# legacy query templates (relative to the site root), kept for backward
# compatibility:
# initial query: https://www.bing.com/search?q=foo&search=&form=QBLH
inital_query = 'search?{query}&search=&form=QBLH'
# following queries: https://www.bing.com/search?q=foo&search=&first=11&FORM=PERE
page_query = 'search?{query}&search=&first={offset}&FORM=PERE'

bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
"""Bing (Web) search API description"""
def _get_offset_from_pageno ( pageno ) :
return ( pageno - 1 ) * 10 + 1
def set_bing_cookies(params, engine_language, engine_region, SID):
    """Set the cookies Bing uses to select UI language, search market and
    SafeSearch level.

    :param params: request params dict; ``params['cookies']`` is mutated.
    :param engine_language: Bing language code (e.g. ``en``).
    :param engine_region: Bing market code (e.g. ``en-US``).
    :param SID: session ID (hex string) placed in the ``_EDGE_S`` cookie.
    """
    # set cookies
    # -----------
    params['cookies']['_EDGE_V'] = '1'

    # _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw
    _EDGE_S = [
        'F=1',
        'SID=%s' % SID,
        'mkt=%s' % engine_region.lower(),
        'ui=%s' % engine_language.lower(),
    ]
    params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S)
    logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S'])

    # "_EDGE_CD": "m=zh-tw",
    _EDGE_CD = [  # pylint: disable=invalid-name
        'm=%s' % engine_region.lower(),  # search region: zh-cn
        'u=%s' % engine_language.lower(),  # UI: en-us
    ]
    params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';'
    logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD'])

    SRCHHPGUSR = [  # pylint: disable=invalid-name
        'SRCHLANG=%s' % engine_language,
        # Trying to set ADLT cookie here seems not to have any effect, I assume
        # there is some age verification by a cookie (and/or session ID) needed,
        # to disable the SafeSearch.
        'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'),
    ]
    params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR)
    logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR'])
def request(query, params):
    """Assemble a Bing-Web request.

    Builds ``params['url']`` from the query, the page number and the optional
    time-range filter, and sets the language/market/SafeSearch cookies via
    :py:func:`set_bing_cookies`.  Returns the mutated ``params``.
    """
    engine_region = traits.get_region(params['searxng_locale'], 'en-US')
    engine_language = traits.get_language(params['searxng_locale'], 'en')

    SID = uuid.uuid1().hex.upper()
    CVID = uuid.uuid1().hex.upper()

    set_bing_cookies(params, engine_language, engine_region, SID)

    # build URL query
    # ---------------

    # query term
    page = int(params.get('pageno', 1))
    query_params = {
        # fmt: off
        'q': query,
        'pq': query,
        'cvid': CVID,
        'qs': 'n',
        'sp': '-1'
        # fmt: on
    }

    # page: Bing wants a Referer pointing at the first-page URL on follow-up
    # requests
    if page > 1:
        referer = base_url + '?' + urlencode(query_params)
        params['headers']['Referer'] = referer
        logger.debug("headers.Referer --> %s", referer)

    query_params['first'] = _get_offset_from_pageno(page)

    if page == 2:
        query_params['FORM'] = 'PERE'
    elif page > 2:
        query_params['FORM'] = 'PERE%s' % (page - 2)

    # time range filter (ez1=day, ez2=week, ez3=month, ez5=custom span)
    filters = ''
    if params['time_range']:
        query_params['filt'] = 'custom'

        if params['time_range'] == 'day':
            filters = 'ex1:"ez1"'
        elif params['time_range'] == 'week':
            filters = 'ex1:"ez2"'
        elif params['time_range'] == 'month':
            filters = 'ex1:"ez3"'
        elif params['time_range'] == 'year':
            epoch_1970 = datetime.date(1970, 1, 1)
            today_no = (datetime.date.today() - epoch_1970).days
            filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no)

    params['url'] = base_url + '?' + urlencode(query_params)
    if filters:
        params['url'] = params['url'] + '&filters=' + filters
    return params
@@ -111,7 +191,8 @@ def response(resp):
url_cite = extract_text ( eval_xpath ( result , ' .//div[@class= " b_attribution " ]/cite ' ) )
# Bing can shorten the URL either at the end or in the middle of the string
if (
url_cite . startswith ( ' https:// ' )
url_cite
and url_cite . startswith ( ' https:// ' )
and ' … ' not in url_cite
and ' ... ' not in url_cite
and ' › ' not in url_cite
@@ -131,9 +212,9 @@ def response(resp):
# resolve all Bing redirections in parallel
request_list = [
Request . get ( u , allow_redirects = False , headers = resp . search_params [ ' headers ' ] ) for u in url_to_resolve
network . Request. get ( u , allow_redirects = False , headers = resp . search_params [ ' headers ' ] ) for u in url_to_resolve
]
response_list = multi_requests ( request_list )
response_list = network . multi_requests( request_list )
for i , redirect_response in enumerate ( response_list ) :
if not isinstance ( redirect_response , Exception ) :
results [ url_to_resolve_index [ i ] ] [ ' url ' ] = redirect_response . headers [ ' location ' ]
@@ -161,94 +242,43 @@ def response(resp):
return results
# get supported languages from their site
def _fetch_supported_languages(resp):
    """Return the list of UI language tags scraped from Bing's settings page.

    :param resp: HTTP response of ``supported_languages_url``; only
        ``resp.text`` is read.
    """
    lang_tags = set()

    dom = html.fromstring(resp.text)
    lang_links = eval_xpath(dom, '//div[@id="language-section"]//li')

    for _li in lang_links:
        href = eval_xpath(_li, './/@href')[0]
        (_scheme, _netloc, _path, _params, query, _fragment) = urlparse(href)
        query = parse_qs(query, keep_blank_values=True)

        setlang = query.get('setlang', [None])[0]
        if setlang is None:
            # some links carry no setlang parameter; skip instead of crashing
            # on None.split()
            continue
        # example: 'mn-Cyrl-MN' --> ['mn', 'Cyrl-MN']
        lang, nation = (setlang.split('-', maxsplit=1) + [None])[:2]
        tag = lang + '-' + nation if nation else lang
        lang_tags.add(tag)

    return list(lang_tags)
def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages and regions from Bing-Web.

    The language and market (region) codes are scraped from the tables of the
    Bing Web-Search API documentation (:py:obj:`bing_traits_url`).
    """
    xpath_market_codes = '//table[1]/tbody/tr/td[3]'
    # xpath_country_codes = '//table[2]/tbody/tr/td[2]'
    xpath_language_codes = '//table[3]/tbody/tr/td[2]'

    _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)


def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str):
    """Populate ``engine_traits`` from the language/market tables at *url*."""
    # pylint: disable=too-many-branches

    # insert alias to map from a language (zh) to a language + script (zh_Hans)
    engine_traits.languages['zh'] = 'zh-hans'

    resp = network.get(url)
    if not resp.ok:
        # was: "response from peertube is not OK." -- copy-paste from another
        # engine; also bail out instead of parsing a failed response
        print("ERROR: response from Bing is not OK.")
        return

    dom = html.fromstring(resp.text)

    # languages

    map_lang = {'jp': 'ja'}
    for td in eval_xpath(dom, xpath_language_codes):
        eng_lang = td.text

        if eng_lang in ('en-gb', 'pt-br'):
            # language 'en' is already in the list and a language 'en-gb' can't
            # be handled in SearXNG, same with pt-br which is covered by pt-pt.
            continue

        babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_')
        try:
            sxng_tag = language_tag(babel.Locale.parse(babel_lang))
        except babel.UnknownLocaleError:
            print("ERROR: language (%s) is unknown by babel" % (eng_lang))
            continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_lang:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
            continue
        engine_traits.languages[sxng_tag] = eng_lang

    # regions

    map_region = {
        'en-ID': 'id_ID',
        'no-NO': 'nb_NO',
    }

    for td in eval_xpath(dom, xpath_market_codes):
        eng_region = td.text
        babel_region = map_region.get(eng_region, eng_region).replace('-', '_')

        if eng_region == 'en-WW':
            # world-wide market --> engine's "all languages/regions" locale
            engine_traits.all_locale = eng_region
            continue

        try:
            sxng_tag = region_tag(babel.Locale.parse(babel_region))
        except babel.UnknownLocaleError:
            print("ERROR: region (%s) is unknown by babel" % (eng_region))
            continue

        conflict = engine_traits.regions.get(sxng_tag)
        if conflict:
            if conflict != eng_region:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region))
            continue
        engine_traits.regions[sxng_tag] = eng_region