ServerPortfolio  2.0
Python parsers and server
 All Classes Namespaces Files Functions Variables Properties Pages
Parser_Bourso.py
Go to the documentation of this file.
1 ## @package serverportfolio.Parsers.Parser_Bourso
2 # Specific parser for querying the french Boursorama website
3 #
4 # Last Changed : $Id: Parser_Bourso.py 13 2015-04-12 19:45:14Z michael $
5 
6 # August 2014, modify the parser with use of xpath, seems working for all stocks
7 # June 2014 Change again format, simple fix but better to do, better structure
8 #
9 # 12/ 12 / 2011 seen they change their format, no more t03 marks
10 #
11 # use of lxml, seems better for parsing
12 
13 import types, re, copy, logging
14 import datetime
15 from time import strftime, localtime
16 
17 from lxml import html, etree
18 
19 from serverportfolio import Utils
20 from serverportfolio.GlobalDicts import EAction
21 from serverportfolio.StockTemplates import StTmpl
22 from serverportfolio.PortfolioException import ParserError # PortfolioError
23 
24 from serverportfolio.Parsers.Abstract import AbstractParser
25 
26 ## @class Parser_Bourso
27 # @brief Specific parser for querying the Boursorama website, only for instantaneous data.
28 # Parse HTML Page of Boursorama: last value, volume, highest, lowest of the day, OPEN or CLOSED state...
30 
## @brief Correspondence between the InstValue template keys and the html page labels.
# Each key maps to a one-element list holding the label looked up in the page.
# A second 0/1 element could later select exact matching versus a "contains" match.
# The labels are compared exactly by the generic extractor, so the trailing
# space of 'Volume ' is significant.
dict_record = {
    'value': ['Cours'],
    'variation': ['Variation'],
    'dernier echange': ['Dernier'],
    'volume': ['Volume '],
    'ouverture': ['Ouverture'],
    'plushaut': ['+ Haut'],
    'plusbas': ['+ Bas'],
}

## @brief Base url of a Boursorama quote page, to be completed with code_bourso.
__INSTVALUE_STOCK_URL = "http://www.boursorama.com/cours.phtml?symbole="
45 
## @brief Constructor.
# @param action EAction to perform, only InstValue with Boursorama
def __init__(self, action):
    # the base class is initialised first, the parser-specific members follow
    AbstractParser.__init__(self, action)

    # tag identifying this parser as the data source
    self.source = "BOURSO"
    self.logger = logging.getLogger('SP.Parser_Bourso')
53 
## @brief Build self.url from the Boursorama code of the stored stock.
# Asserts that exactly one Stock is held in self.local_stock (Parser dependent).
# @throw ParserError if the url cannot be built from the stock's 'code_bourso'
def create_url(self):
    log = self.logger
    log.debug("create_url()")
    log.debug("local stock: %s", self.local_stock)

    nb_stocks = len(self.local_stock)
    assert nb_stocks == 1, \
        "Parser_Bourso deals only with one Stock at a time, nb: %d" % nb_stocks

    # the single entry of local_stock gives both the symbol and the Stock object
    stock_symbol = list(self.local_stock.keys())[0]
    stock_obj = self.local_stock[stock_symbol]

    try:
        self.url = self.__INSTVALUE_STOCK_URL + stock_obj.get_action('Static', 'code_bourso')
        log.info("url: %s", self.url)
    except Exception as ex:
        # any failure here is reported to the caller as a ParserError
        log.error('Caught Exception in create_url: %s', ex)
        raise ParserError("Cannot create url", stock_symbol, self.e_action.name, None)
73 
## @brief Parse the html page, specific to instantaneous values.
# Fills self.list_return_data (a single template, no list is created since only
# one stock is handled) with the data extracted from the complete html page.
# @param html_page the html page returned by UtilsParsers.web_query
# @throw ParserError if the table of values is not found in the page, or if
#        any entry of dict_record cannot be extracted
def parse(self, html_page):
    self.logger.debug("entry parse")

    # list_return_data is set to None in init and clean, create a first entry
    self.list_return_data = StTmpl.get_template_parser(self.e_action)
    stock_symbol = list(self.local_stock.keys())[0]
    self.list_return_data['symbol'] = stock_symbol
    self.logger.debug("list_return_data action template: %s", self.list_return_data)

    # shortcut to the part of the template filled by the extractors
    tmp_templ = self.list_return_data['action_templ'][self.e_action.name]

    htmltree = html.document_fromstring(html_page)

    # table of values: normally only one is present, but some pages show
    # two (GSZ, one for CDS) and some none at all (seen with EURUS)
    table_tmp_extracted = htmltree.xpath("//table[@class='info-valeur list']")
    if not table_tmp_extracted:
        self.logger.error("Could not extract the table from the html page, url, self.url %s", self.url)
        # keep the page in the log to see what is going on
        self.logger.error("full html page:\n%s", html_page)
        self.logger.error("list_return_data with error: %s", self.list_return_data)
        raise ParserError("Could not extract the table from the html page",
                          self.list_return_data['symbol'], self.e_action.name, self.url)
    table_extracted = table_tmp_extracted[0]

    # extract the data, fill the template
    for name in self.dict_record.keys():
        try:
            self._extract_data(name, table_extracted, tmp_templ)
        except ParserError as ex:
            # already carries symbol/action/url: do not bury it in a second ParserError
            self.logger.debug("Catch Exception in _extract_data: %s", ex)
            raise
        except Exception as ex:
            self.logger.debug("Catch Exception in _extract_data: %s", ex)
            raise ParserError(ex, stock_symbol, self.e_action.name, self.url)
126 
127  ## @brief Extract the instantaneous values from the main lxml etree table.
128  # @param name of the data to extract, defined in self.dict_record
129  # @param table etree table containing all data
130  # @param dict_inst_value template to fill
131  def _extract_data( self, name, table, dict_inst_value ):
132  #print "entry _extract_data"
133  #print "name ", name
134  #print "table ", table, table.tag
135 
136  # link to the Stock symbol and object
137  stock_symbol = self.local_stock.keys()[0]
138  stock_obj = self.local_stock [ stock_symbol ]
139 
140  # check for specific implementation
141  if name == "dernier echange":
142 
143  # EURUS, 'Dernier echange' is not present, no alternative, use the actual date and time
144  if stock_obj.get_market() == 'devise':
145  dict_inst_value['date'] = strftime("%Y-%m-%d", localtime() )
146  dict_inst_value['time'] = strftime("%H:%M:%S", localtime() )
147 
148  # other present
149  else :
150  ( date, time ) = self._extract_data_dernier( table )
151  #print "alternative date/time ", date, time
152  dict_inst_value['date'] = date
153  dict_inst_value['time'] = time
154  #print "dict_data date ", dict_inst_value['date']
155  #print "dict_data time ", dict_inst_value['time']
156 
157  # if value, determine also state: CLOSED or OPEN
158  elif name == "value":
159 
160  (value, state) = self._extract_data_value( table )
161  #print "alternative value/state ", value, state
162  dict_inst_value['value'] = value
163  dict_inst_value['state'] = state
164  #print "dict_data value ", dict_inst_value['value']
165  #print "dict_data state ", dict_inst_value['state']
166 
167  # all other cases call generic function
168  else:
169  tmp_data = self._extract_data_generic( name, table )
170  dict_inst_value[name] = tmp_data
171  #print "dict_inst_value[ %s ] = %s " % (name,dict_inst_value[name])
172 
173  ## @brief Generic version to parse values, extract the last text() from a td entry.
174  # Volume not present with devise, throw an exception and fill with default 0, nan (??)\n
175  # @param name of the td entry to extract
176  # @param table of instantaneous values extracted by xpath
177  # @return value extracted
178  def _extract_data_generic(self, name, table):
179  #print "entry _extract_data_generic "
180  # build xpath_query
181  query = ".//*[text()='%s']" % ( self.dict_record[name][0] )
182  #print "query ", query
183 
184  # default value
185  tmp_data = 0.
186 
187  # get exception in reading Volume in EURUS
188  try :
189  # can set tmp_data to undefined, out of range here, avoid exception but still 0
190  source = table.xpath( query ) #[0]
191  # need the test, if not found, if exception anyway not needed (sometimes no exception but empty)
192  #print 'source ', source
193  if source:
194  source = source[0]
195  # second query get the text from the first parent, last td
196  tmp_data = source.xpath("..//td[last()]//text()")
197  #print tmp_data
198  tmp_data = self._extract_number( tmp_data[0] )
199 
200  # may report exception, volume and devise, let default 0 or use nan ?
201  except Exception as ex:
202  print "got exception in _extract_data_generic ex:", ex
203  self.logger.error("Exception in reading Volume of EURUS (at least)")
204 
205  #print "tmp_data ", tmp_data
206  return tmp_data
207 
208  ## @brief General function to extract all numbers ( price, volume...)
209  # @param line string in input
210  # @return the corrected string
211  def _extract_number ( self, line ):
212 
213  if ('ND' in line): # | (line.strip() == '-'):
214  #print "Not a number" or 0 ??
215  return float('nan')
216  # try to extract any number from the entry
217  return float(''.join(re.findall(r'([\d+\s\.\+\-])',line)).replace(' ',''))
218 
219  ## @brief Specific to extract the value and the state of the stock (OPEN/CLOSED).
220  # @param table of instantaneous values extracted by xpath
221  # @return tuple(value, state)
222  def _extract_data_value(self, table):
223 
224  cours = table.xpath(".//*[text()='Cours']")
225  # most generic, but return many /t/n in the list
226  str_value = cours[0].xpath("../..//td[last()]//text()")
227 
228  # cac40: 11.505 (c) Pts
229  # DJ: 17 081.52 Pts
230  # here str_value seems to have always 4 elements, the 3trd with data
231  #print "str_value ", str_value
232  #print "str_value[2]", str_value[2]
233  value = self._extract_number( str_value[2] )
234  #print "value", value
235 
236  no = str_value[2].find( '(c)')
237  if no >= 0:
238  state = 'CLOSED'
239  else:
240  state = 'OPEN'
241 
242  return ( value, state )
243 
244  ## @brief Specific to get the date and time of the dernier echange.
245  # @param table of instantaneous values extracted by xpath
246  # @return tupe(date, time) in string format
247  def _extract_data_dernier( self, table ):
248 
249  #print "entry _extract_data_dernier "
250 
251  # here crash with EURUS, exception not working, "Cours Forex 1"
252  dernier = table.xpath(".//*[contains(text(),'Dernier')]")
253  #print "dernier ", dernier
254  # here dernier maybe empty dernier[0], crash
255  #if len(dernier) == 1: # avoid crash, but appear later (should get correct exception!)
256 
257  tmp_datetime = dernier[0].xpath("../td[last()]//text()")
258 
259  # format tmp_datetime = [u'28/08/14\xa0', '17:37:15']
260  # or in one string
261  # print "tmp_datetime ", tmp_datetime
262 
263  #DJ one string, now CAC40 also one string
264  if len(tmp_datetime) == 1:
265  #print "len== 1 tmp_datetime[0] ", tmp_datetime[0]
266  #print "tmp_date[1] ", tmp_datetime[1]
267  text_ascii = tmp_datetime[0].encode('ascii','replace')
268  #print "text_ascii ", text_ascii
269  tab_datetime = text_ascii.rsplit('?')
270  date_tmp=datetime.datetime.strptime( tab_datetime[0], "%d/%m/%y" )
271  time_tmp=tab_datetime[1]
272 
273  # CAC40, splitted in 2: format tmp_datetime = [u'28/08/14\xa0', '17:37:15']
274  # Not anymore, can delete ?
275  elif len(tmp_datetime) == 2:
276  # correct encoding
277  text_ascii = tmp_datetime[0].encode('ascii','replace')[:-1]
278  #print "text_ascii ", text_ascii
279  date_tmp=datetime.datetime.strptime( text_ascii, "%d/%m/%y" )
280  time_tmp=tmp_datetime[1]
281 
282  # error, critical or not ?
283  else :
284  #print "ERROR len(tmp_datetime) ", len(tmp_datetime)
285  self.logger.error("_extract_data_dernier tmp_datetime %s" % tmp_datetime )
286  #raise ParserError() better
287  raise ParserError('Cannot extract date of the dernier echange',
288  self.local_stock.keys()[0], self.e_action.name, self.url)
289 
290 
291  return ( date_tmp.strftime("%Y-%m-%d"), time_tmp )
292 
293 # ## Not used, keep for idea, volume, euro/dollar
294 # # format data from string to a correct definiton
295 # # usable by the calling function
296 # def format_data(self):
297 #
298 #
299 # print 'format_data should NOT be used anymore !'
300 # print self.dict_data['value']
301 #
302 # no=self.dict_data['value'].find('(c)')
303 # if no >= 0:
304 # self.dict_data['state']='CLOSED'
305 # else:
306 # self.dict_data['state']='OPEN'
307 #
308 # self.dict_data['value']=extract_number(self.dict_data['value'])
309 # self.dict_data['variation']=extract_number(self.dict_data['variation'])
310 # self.dict_data['plusbas']=extract_number(self.dict_data['plusbas'])
311 # self.dict_data['plushaut']=extract_number(self.dict_data['plushaut'])
312 # self.dict_data['ouverture']=extract_number(self.dict_data['ouverture'])
313 #
314 # ##for volume, take out
315 # line=self.dict_data['volume']
316 # if line!='0':
317 # if 'M' in line:
318 # unit=1e6
319 # else:
320 # unit=1
321 # self.dict_data['volume']=unit*extract_number(line[:no])
322 # else:
323 # self.dict_data['volume']=0
324 
# for testing/debugging
# each step of the parser (store, create_url, web_query, parse, update) is
# called independently first, then the complete chain is run through run_parser
if __name__ == "__main__":

    from serverportfolio.DictionaryStocks import DictionaryStocks
    # PortfolioError is caught below but is not imported at the top of the file
    from serverportfolio.PortfolioException import PortfolioError
    import UtilsParsers
    import sys
    import time

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger('SP')
    main_logger = logging.getLogger("SP.main")

    try:
        stock = sys.argv[1]
        action = sys.argv[2]
    except IndexError:
        print(" Only for debugging, should call ./Run_Parsers for all options ")
        print(" Usage:")
        print(" Parser_Bourso CAC40 InstValue")
        print(" Action: InstValue only implemented")
        sys.exit(1)

    # works only with one Stock (test threads later), but expects a list
    list_stock = [stock]
    # the action given on the command line is not used: only InstValue is implemented
    parser = Parser_Bourso(EAction.InstValue)

    # defined in AbstractParser, expect a list
    print("\n== store copy of object in parser")
    try:
        parser.store_stock_copy(Utils.to_list(stock))
    except Exception:
        print("Caught error in store_stock_copy")
        raise

    print("\n== create_url")
    try:
        parser.create_url()
    except Exception as ex:
        print("Catch Exception  %s" % (ex,))
        sys.exit(1)

    print("\n== web_query ")
    try:
        s = UtilsParsers.web_query(parser.url)
    # raise urllibERROR at the moment
    except Exception as ex:
        print("Got exception from web_query:  %s" % (ex,))
        sys.exit(1)

    print("\n== parse the web page")
    try:
        parser.parse(s)
    except ParserError as ex:
        print("Main catch PaserError  %s" % (ex,))
        print(ex.get_format_string())
        sys.exit(1)
    except Exception as ex:
        # reported only, the update step below is still attempted
        print("Catch standard exception  %s" % (ex,))

    print("\n== update the stock(s)")
    parser.update_stock()

    print(Utils.pretty_dict(DictionaryStocks().get_stocks(list_stock).get_action()))

    print("\n== sleep 5, modify state and second update ")
    time.sleep(5)
    stock = DictionaryStocks().get_stocks(list_stock)
    stock.set_action('InstValue', 'state', 'OPEN')

    try:
        parser.run_parser(list_stock)
    except PortfolioError as ex:
        print("Caught PortfolioEorror")
        print("ex: %s" % (ex.get_format_string(),))
        sys.exit(1)
    except Exception as ex:
        print("Catch standard exception  %s" % (ex,))

    print(Utils.pretty_dict(DictionaryStocks().get_stocks(list_stock).get_action()))
406 
e_action
enumeration (EAction) of the type of query to perform
Definition: Abstract.py:40
dictionary dict_record
Make the correspondence between the html page and the template InstValue.
Define the global variable StockTemplates.StTmpl and dictionary templates.
def parse
Parse the html page, specific to instantaneous values.
def _extract_data_value
Specific to extract the value and the state of the stock (OPEN/CLOSED).
Define 2 abstract methods which need to be overridden by the Parsers and a generic algorithm (run_pa...
Definition: Abstract.py:29
def create_url
Create the url with the code of the stock.
def _extract_data_generic
Generic version to parse values, extract the last text() from a td entry.
Define custom and specific exceptions for the complete package.
Derived class specific to the parsers.
Define an abstract base class for specific Parsers.
Definition: Abstract.py:1
string __INSTVALUE_STOCK_URL
url format for boursorama, to complete with code_bourso
def _extract_number
General function to extract all numbers ( price, volume...)
Container of all Stocks objects, it also reads the static stocks configuration file "dictstocks...
Global variables for configuration: paths, TCP ports and generic definitions.
Definition: GlobalDicts.py:1
url
save url, useful for reporting errors and exceptions
Definition: Abstract.py:37
def _extract_data_dernier
Specific to get the date and time of the dernier echange.
def _extract_data
Extract the instantaneous values from the main lxml etree table.
Define singleton class DictionaryStocks, act as the main container of Stocks objects.
Specific parser for querying the Boursorama website, only for instantaneous data. ...