1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMTcxMi4wMDUwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2018-12-05T00:28:03.000Z","paperID":"1712.00504","published":"2017-11-30T16:31:36.000Z","authors":"[\"Rui Luo\",\"Weinan Zhang\",\"Xiaojun Xu\",\"Jun Wang\"]","title":"A Neural Stochastic Volatility Model","scoreTrending":null,"summary":"$2b","lastCheckedForCode":"2022-09-01T17:07:57.615Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9hLW5ldXJhbC1zdG9jaGFzdGljLXZvbGF0aWxpdHktbW9kZWwifQ==","type":"pwc","url":"https://paperswithcode.com/paper/a-neural-stochastic-volatility-model","data":null}],"reposConnection":{"edges":[{"official":null,"node":{"id":"eyJyZXBvSUQiOiIxMTgwNjE4NTMiLCJzb3VyY2UiOiJnaXRodWIifQ==","source":"github","repoID":"118061853","url":"https://github.com/craha22/Qfin","title":"Qfin","language":"r","stars":0,"forks":0,"framework":null,"scoreTrending":null,"updated":null,"created":null,"downloads":null,"likes":null,"owner":[{"username":"craha22","avatar":"https://avatars.githubusercontent.com/u/20206967?v=4"}]}}]},"models":[],"tags":[{"id":"eyJuYW1lIjoidGltZSBzZXJpZXMgYW5hbHlzaXMiLCJ0eXBlIjoidGFzayJ9","name":"time series analysis","description":"Time series analysis involves inputting sequential data over time to predict future values or trends. It's commonly used in finance for stock price prediction, weather forecasting, and in any field where data is collected over time.","scoreTrending":0.12842231300996784,"count":{"stars":3368,"papers":1418,"models":1218},"__typename":"Tag"},{"id":"eyJuYW1lIjoidGltZSBzZXJpZXMiLCJ0eXBlIjoidGFzayJ9","name":"time series","description":"In time series forecasting, the input is a sequence of data points collected over time, and the output is a prediction of future data points. 
This method is commonly used in finance for stock price prediction, weather forecasting, and sales forecasting.","scoreTrending":0.12842231300996784,"count":{"stars":15122,"papers":7893,"models":5919},"__typename":"Tag"}],"summaries":[],"emailsConnection":{"edges":[{"author":"rui luo","node":{"id":"eyJhZGRyZXNzIjoici5sdW9AY3MudWNsLmFjLnVrIn0=","address":"r.luo@cs.ucl.ac.uk","name":"R. Luo","avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[{"name":"University College London"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"5AnhSDQAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiJlNGYwZTBiNS0xNDQxLTRkNTQtYTEyOS0wNTA0MTdkYWQ2OWUifQ==","name":"rui luo","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTkwMS4wOTIwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1901.09207"},{"id":"eyJwYXBlcklEIjoiMTcxMi4wMDUwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1712.00504"},{"id":"eyJwYXBlcklEIjoiMTYwNi4wNjkwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1606.06905"},{"id":"eyJwYXBlcklEIjoiMjMwNS4wNTIyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2305.05228"},{"id":"eyJwYXBlcklEIjoiMjEwOS4wOTgzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2109.09833"},{"id":"eyJwYXBlcklEIjoiMTkwOS4xMjkzOSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1909.12939"},{"id":"eyJwYXBlcklEIjoiMTkwNS4xMjU2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.12569"},{"id":"eyJwYXBlcklEIjoiMjEwNS4wOTU5NyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2105.09597"}]}]}},{"author":"jun wang","node":{"id":"eyJhZGRyZXNzIjoiai53YW5nQGNzLnVjbC5hYy51ayJ9","address":"j.wang@cs.ucl.ac.uk","name":"Jun Wang","avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[{"name":"University College 
London"}],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"S0jdSPEAAAAJ"},{"thirdPartyID":"MIQItw4AAAAJ"},{"thirdPartyID":"wIE1tY4AAAAJ"},{"thirdPartyID":"K1FKF3IAAAAJ"}],"twitter":[],"location":[],"owner":[]}},{"author":"weinan zhang","node":{"id":"eyJhZGRyZXNzIjoid256aGFuZ0BhcGV4LnNqdHUuZWR1LmNuIn0=","address":"wnzhang@apex.sjtu.edu.cn","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregat
e":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[{"avatar":"https://avatars.githubusercontent.com/u/2649287?v=4","username":"wnzhang"}],"scholar":[{"thirdPartyID":"hqq3rCQAAAAJ"},{"thirdPartyID":"Qzss0GEAAAAJ"},{"thirdPartyID":"4kggyFIAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiIyYmJjNTA2OS04Y2VhLTQxMDgtOTBhNS04YjUwMDlmZmIxOGEifQ==","name":"weinan zhang","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTYwOS4wNTQ3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1609.05473"},{"id":"eyJwYXBlcklEIjoiMTcwOS4wODYyNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1709.08624"},{"id":"eyJwYXBlcklEIjoiMjAwMS4wOTM4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2001.09382"},{"id":"eyJwYXBlcklEIjoiMTgwMi4wMTg4NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1802.01886"},{"id":"eyJwYXBlcklEIjoiMTcwNy4wMTIxNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1707.01217"},{"id":"eyJwYXBlcklEIjoiMTYxMS4wMDE0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1611.00144"},{"id":"eyJwYXBlcklEIjoiMTcwNy4wNDg3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1707.04873"},{"id":"eyJwYXBlcklEIjoiMTcwMS4wMjQ5MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1701.02490"},{"id":"eyJwYXBlcklEIjoiMTgwMy4wNzEzMyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1803.07133"},{"id":"eyJwYXBlcklEIjoiMTgxMS4wNTg2OSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1811.05869"},{"id":"eyJwYXBlcklEIjoiMTcwMi4wNjY3NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1702.06674"},{"id":"eyJwYXBlcklEIjoiMTcxMi4wMDUwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":
"arxiv","paperID":"1712.00504"},{"id":"eyJwYXBlcklEIjoiMTgwOC4wMzczNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1808.03737"},{"id":"eyJwYXBlcklEIjoiMjAwOC4wNzkwNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2008.07905"},{"id":"eyJwYXBlcklEIjoiMTkwMy4wMTM0NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1903.01344"},{"id":"eyJwYXBlcklEIjoiMTgwNC4wMzc4MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1804.03782"},{"id":"eyJwYXBlcklEIjoiMjEwMi4wMDcxNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.00714"},{"id":"eyJwYXBlcklEIjoiMjEwNC4xMDU4NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2104.10584"},{"id":"eyJwYXBlcklEIjoiMTgwNC4wOTAyMSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1804.09021"},{"id":"eyJwYXBlcklEIjoiMTkwNS4wMzAyOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.03028"},{"id":"eyJwYXBlcklEIjoiMTgwMy4wMjE5NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1803.02194"},{"id":"eyJwYXBlcklEIjoiMTkxMS4wMzg4MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1911.03883"},{"id":"eyJwYXBlcklEIjoiMTkwNS4wMDc1OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1905.00758"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wNjYzNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.06635"},{"id":"eyJwYXBlcklEIjoiMjExMS4wNTQwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2111.05407"},{"id":"eyJwYXBlcklEIjoiMjAxMC4wOTU0NiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.09546"},{"id":"eyJwYXBlcklEIjoiMjAxMi4wMzQyMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2012.03420"},{"id":"eyJwYXBlcklEIjoiMjExMS4wODU1MCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2111.08550"},{"id":"eyJwYXBlcklEIjoiMjAxMC4xMTczNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2010.11737"},{"id":"eyJwYXBlcklEIjoiMjIwMS4wMjAwOSIsInB1Ymxp
c2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.02009"},{"id":"eyJwYXBlcklEIjoiMTgwOS4wNDIzNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1809.04234"},{"id":"eyJwYXBlcklEIjoiMjAwOS4wODA1MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2009.08052"},{"id":"eyJwYXBlcklEIjoiMjEwOC4wNzEwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2108.07107"},{"id":"eyJwYXBlcklEIjoiMjIwMS4wOTcwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2201.09708"},{"id":"eyJwYXBlcklEIjoiMjMxMC4xMzg5MiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.13892"}]}]}},{"author":"xiaojun xu","node":{"id":"eyJhZGRyZXNzIjoieHV4akBhcGV4LnNqdHUuZWR1LmNuIn0=","address":"xuxj@apex.sjtu.edu.cn","name":null,"avatar":null,"linkedin":null,"bio":null,"site":null,"override":null,"membership":[],"paper":[{"modelsAggregate":{"count":0}},{"modelsAggregate":{"count":0}}],"github":[],"scholar":[{"thirdPartyID":"rdMZZQwAAAAJ"}],"twitter":[],"location":[],"owner":[{"id":"eyJ1aWQiOiI5MzVlNmI3My1lZjY4LTQzNDgtYTVjOS0zMzM4MTk1ZTAyNzkifQ==","name":"xiaojun 
xu","github":[],"email":[],"authored":[{"id":"eyJwYXBlcklEIjoiMTcwOC4wNjUyNSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1708.06525"},{"id":"eyJwYXBlcklEIjoiMjAwMy4wODkwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2003.08904"},{"id":"eyJwYXBlcklEIjoiMTkxMC4wMzEzNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1910.03137"},{"id":"eyJwYXBlcklEIjoiMTcxMi4wMDUwNCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"1712.00504"},{"id":"eyJwYXBlcklEIjoiMjAwMi4xMjM5OCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2002.12398"},{"id":"eyJwYXBlcklEIjoiMjEwNy4xMDg3MyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2107.10873"},{"id":"eyJwYXBlcklEIjoiMjIwNy4xMDMwOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2207.10308"},{"id":"eyJwYXBlcklEIjoiMjIxMC4xMTYyMCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2210.11620"},{"id":"eyJwYXBlcklEIjoiMjEwNC4wMDY3MSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2104.00671"},{"id":"eyJwYXBlcklEIjoiMjIwMi4wMTgzMiIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2202.01832"},{"id":"eyJwYXBlcklEIjoiMjMxMC4xMTg2NSIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2310.11865"},{"id":"eyJwYXBlcklEIjoiMjEwMi4xMzE4NCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2102.13184"},{"id":"eyJwYXBlcklEIjoiMjIxMi4xMzYwNyIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","paperID":"2212.13607"},{"id":"eyJwYXBlcklEIjoiNTMwMDkiLCJwdWJsaXNoZXIiOiJuZXVyaXBzIn0=","publisher":"neurips","paperID":"53009"}]}]}}]},"__typename":"paper","authorArray":["Rui Luo","Weinan Zhang","Xiaojun Xu","Jun 
Wang"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2c",null,{"publisher":"arxiv","paperID":"1712.00504","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2d",null,{"article":"$L2e","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2f",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L30",null,{"paperID":"1712.00504","publisher":"arxiv","paperJSON":{"title":"A Neural Stochastic Volatility Model","paperID":"1712.00504","avgLineHeight":10.96,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"text":"In this paper, we show that the recent integration of statistical models with deep recurrent neural networks provides a new way of formulating volatility (the degree of variation of time series) models that have been widely used in time series analysis and prediction in finance. The model comprises a pair of complementary stochastic recurrent neural networks: the generative network models the joint distribution of the stochastic volatility process; the inference network approximates the conditional distribution of the latent variables given the observables. Our focus here is on the formulation of temporal dynamics of volatility over time under a stochastic recurrent neural network framework. 
Experiments on real-world stock price datasets demonstrate that the proposed model generates a better volatility estimation and prediction that outperforms mainstream methods, e.g., deterministic models such as GARCH and its variants, and stochastic models namely the MCMC-based model ","element":"span"},{"style":{"fontStyle":"italic"},"text":"stochvol ","element":"span"},{"text":"as well as the Gaussian process volatility model ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GPVol","element":"span"},{"text":", on average negative log-likelihood.","element":"span"}]]},{"heading":"Introduction","paragraphs":[[{"text":"The volatility of the price movements reflects the ubiquitous uncertainty within financial markets. It is critical that the level of risk (aka, the degree of variation), indicated by volatility, is taken into consideration before investment decisions are made and portfolio are optimised (","element":"span"},{"href":"#id-0","referenceIndex":15,"text":"Hull ","element":"a"},{"href":"#id-0","referenceIndex":15,"text":"2006","element":"a"},{"text":"); volatility is substantially a key variable in the pricing of derivative securities. Hence, estimating and forecasting volatility is of great importance in branches of financial studies, including investment, risk management, security valuation and monetary policy making (","element":"span"},{"href":"#id-1","referenceIndex":25,"text":"Poon and Granger 2003","element":"a"},{"text":").","element":"span"}],[{"text":"Volatility is measured typically by employing the standard deviation of price change in a fixed time interval, such as a day, a month or a year. The higher the volatility is, the riskier the asset should be. One of the primary challenges in designing volatility models is to identify the existence of latent stochastic processes and to characterise the underlying dependences or interactions between variables within a certain time span. 
A classic approach has been to handcraft the characteristic features of volatility models by imposing assumptions and constraints, given prior knowledge and observations. Notable examples include autoregressive conditional heteroscedasticity (ARCH) model (","element":"span"},{"href":"#id-2","referenceIndex":9,"text":"Engle 1982","element":"a"},{"text":") and the extension, generalised ARCH (GARCH) (","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"Bollerslev ","element":"a"},{"href":"#id-3","referenceIndex":4,"text":"1986","element":"a"},{"text":"), which makes use of autoregression to capture the properties of time-varying volatility within many time series. As an alternative to the GARCH model family, the class of stochastic volatility (SV) models specify the variance to follow some latent stochastic process (","element":"span"},{"href":"#id-4","referenceIndex":14,"text":"Hull and White ","element":"a"},{"href":"#id-4","referenceIndex":14,"text":"1987","element":"a"},{"text":"). Heston (","element":"span"},{"href":"#id-5","referenceIndex":12,"text":"Heston 1993","element":"a"},{"text":") proposed a continuous-time model with the volatility following an Ornstein-Uhlenbeck process and derived a closed-form solution for options pricing. Since the temporal discretisation of continuous-time dynamics sometimes leads to a deviation from the original trajectory of system, those continuous-time models are seldom applied in forecasting. For practical purposes of forecasting, the canonical model (","element":"span"},{"href":"#id-6","referenceIndex":16,"text":"Jacquier, Polson, and Rossi 2002","element":"a"},{"text":"; ","element":"span"},{"href":"#id-7","referenceIndex":18,"text":"Kim, Shephard, and Chib 1998","element":"a"},{"text":") formulated in a discrete-time fashion for regularly spaced data such as daily prices of stocks is of great interest. 
While theoretically sound, those approaches require strong assumptions which might involve detailed insight of the target sequences and are difficult to determine without a thorough inspection.","element":"span"}],[{"text":"In this paper, we take a fully data driven approach and determine the configurations with as few exogenous input as possible, or even purely from the historical data. We propose a neural network re-formulation of stochastic volatility by leveraging stochastic models and recurrent neural networks (RNNs). In inspired by the work from Chung et al. (","element":"span"},{"href":"#id-8","referenceIndex":7,"text":"Chung et al. 2015","element":"a"},{"text":") and Fraccaro et al. (","element":"span"},{"href":"#id-9","referenceIndex":10,"text":"Fraccaro et al. 2016","element":"a"},{"text":"), the proposed model is rooted in variational inference and equipped with the latest advances of stochastic neural networks. The model inherits the fundamentals of SV model and provides a general framework for volatility modelling; it extends previous sequential frameworks with autoregressive and bidirectional architecture and provide with a more systematic and volatility-specific formulation on stochastic volatility modelling for financial time series. We presume that the latent variables follow a Gaussian autoregressive process, which is then utilised to model the variance process. Our neural network formulation is essentially a general framework for volatility modelling, which covers two major classes of volatility models in financial study as the special cases with specific weights and activations on neurons.","element":"span"}],[{"text":"Experiments with real-world stock price datasets are performed. 
The result shows that the proposed model produces more accurate estimation and prediction, outperforming various widely-used deterministic models in the GARCH family and several recently proposed stochastic models on average negative log-likelihood; the high flexibility and rich expressive power are validated.","element":"span"}]]},{"heading":"Related Work","paragraphs":[[{"text":"A notable framework for volatility is autoregressive conditional heteroscedasticity (ARCH) model (","element":"span"},{"href":"#id-2","referenceIndex":9,"text":"Engle 1982","element":"a"},{"text":"): it can accurately identify the characteristics of time-varying volatility within many types of time series. Inspired by ARCH model, a large body of diverse work based on stochastic process for volatility modelling has emerged (","element":"span"},{"href":"#id-10","referenceIndex":3,"text":"Bollerslev, Engle, and Nelson 1994","element":"a"},{"text":"). Bollerslev (","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"Bollerslev ","element":"a"},{"href":"#id-3","referenceIndex":4,"text":"1986","element":"a"},{"text":") generalised ARCH model to the generalised autoregressive conditional heteroscedasticity (GARCH) model in a manner analogous to the extension from autoregressive (AR) model to autoregressive moving average (ARMA) model by introducing the past conditional variances in the current conditional variance estimation. Engle and Kroner (","element":"span"},{"href":"#id-11","referenceIndex":8,"text":"Engle and ","element":"a"},{"href":"#id-11","referenceIndex":8,"text":"Kroner 1995","element":"a"},{"text":") presented theoretical results on the formulation and estimation of multivariate GARCH model within simultaneous equations systems. The extension to multivariate model allows the covariance to present and depend on the historical information, which are particularly useful in multivariate financial models. 
An alternative to the conditionally deterministic GARCH model family is the class of stochastic volatility (SV) models, which first appeared in the theoretical finance literature on option pricing (","element":"span"},{"href":"#id-4","referenceIndex":14,"text":"Hull ","element":"a"},{"href":"#id-4","referenceIndex":14,"text":"and White 1987","element":"a"},{"text":"). The SV models specify the variance to follow some latent stochastic process such that the current volatility is no longer a deterministic function even if the historical information is provided. As an example, Heston’s model (","element":"span"},{"href":"#id-5","referenceIndex":12,"text":"Heston 1993","element":"a"},{"text":") characterises the variance process as a Cox-Ingersoll-Ross process driven by a latent Wiener process. While theoretically sound, those approaches require strong assumptions which might involve complex probability distributions and non-linear dynamics that drive the process. Nevertheless, empirical evidences have confirmed that volatility models provide accurate prediction (","element":"span"},{"href":"#id-12","referenceIndex":1,"text":"Ander- ","element":"a"},{"href":"#id-12","referenceIndex":1,"text":"sen and Bollerslev 1998","element":"a"},{"text":") and models such as ARCH and its descendants/variants have become indispensable tools in asset pricing and risk evaluation. Notably, several models have been recently proposed for practical forecasting tasks: Kastner et al. (","element":"span"},{"href":"#id-13","referenceIndex":17,"text":"Kastner and Fr¨uhwirth-Schnatter 2014","element":"a"},{"text":") implemented the MCMC-based framework ","element":"span"},{"style":{"fontStyle":"italic"},"text":"stochvol ","element":"span"},{"text":"where the ancillarity-sufficiency interweaving strategy (ASIS) is applied for boosting MCMC estimation of stochastic volatility; Wu et al. 
(","element":"span"},{"href":"#id-14","referenceIndex":31,"text":"Wu, Hern´andez-Lobato, and Ghahramani 2014","element":"a"},{"text":") designed the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GP-Vol","element":"span"},{"text":", a non-parametric model which utilises Gaussian processes to characterise the dynamics and jointly learns the process and hidden states via online inference algorithm. Despite the fact that it provides us with a practical approach towards stochastic volatility forecasting, both models require a relatively large volume of samples to ensure the accuracy, which involves very expensive sampling routine at each time step. Another drawback is that those models are incapable to handle the forecasting task for multivariate time series.","element":"span"}],[{"text":"On the other hand, deep learning (","element":"span"},{"href":"#id-15","referenceIndex":22,"text":"LeCun, Bengio, and ","element":"a"},{"href":"#id-15","referenceIndex":22,"text":"Hinton 2015","element":"a"},{"text":"; ","element":"span"},{"href":"#id-16","referenceIndex":27,"text":"Schmidhuber 2015","element":"a"},{"text":") that utilises nonlinear structures known as deep neural networks, powers various applications. It has triumph over pattern recognition challenges, such as image recognition (","element":"span"},{"href":"#id-17","referenceIndex":21,"text":"Krizhevsky, Sutskever, ","element":"a"},{"href":"#id-17","referenceIndex":21,"text":"and Hinton 2012","element":"a"},{"text":"), speech recognition (","element":"span"},{"href":"#id-18","referenceIndex":6,"text":"Chorowski et al. 
","element":"a"},{"href":"#id-18","referenceIndex":6,"text":"2015","element":"a"},{"text":"), machine translation (","element":"span"},{"href":"#id-19","referenceIndex":2,"text":"Bahdanau, Cho, and Bengio ","element":"a"},{"href":"#id-19","referenceIndex":2,"text":"2014","element":"a"},{"text":") to name a few.","element":"span"}],[{"text":"Time-dependent neural networks models include RNNs with neuron structures such as long short-term memory (LSTM) (","element":"span"},{"href":"#id-20","referenceIndex":13,"text":"Hochreiter and Schmidhuber 1997","element":"a"},{"text":"), bidirectional RNN (BRNN) (","element":"span"},{"href":"#id-21","referenceIndex":28,"text":"Schuster and Paliwal 1997","element":"a"},{"text":"), gated recurrent unit (GRU) (","element":"span"},{"href":"#id-22","referenceIndex":5,"text":"Cho et al. 2014","element":"a"},{"text":") and attention mechanism (","element":"span"},{"href":"#id-23","referenceIndex":32,"text":"Xu et al. 2015","element":"a"},{"text":"). Recent results show that RNNs excel for sequence modelling and generation in various applications (","element":"span"},{"href":"#id-24","referenceIndex":30,"text":"van den Oord, Kalchbrenner, and Kavukcuoglu 2016","element":"a"},{"text":"; ","element":"span"},{"href":"#id-22","referenceIndex":5,"text":"Cho et al. 2014","element":"a"},{"text":"; ","element":"span"},{"href":"#id-23","referenceIndex":32,"text":"Xu et al. 2015","element":"a"},{"text":"). However, despite its capability as non-linear universal approximator, one of the drawbacks of neural networks is its deterministic nature. Adding latent variables and their processes into neural networks would easily make the posteriori computationally intractable. 
Recent work shows that efficient inference can be found by variational inference when hidden continuous variables are embedded into the neural networks structure (","element":"span"},{"href":"#id-25","referenceIndex":20,"text":"Kingma and Welling 2013","element":"a"},{"text":"; ","element":"span"},{"href":"#id-26","referenceIndex":26,"text":"Rezende, Mohamed, and Wier- ","element":"a"},{"href":"#id-26","referenceIndex":26,"text":"stra 2014","element":"a"},{"text":"). Some early work has started to explore the use of variational inference to make RNNs stochastic: Chung et al. (","element":"span"},{"href":"#id-8","referenceIndex":7,"text":"Chung et al. 2015","element":"a"},{"text":") defined a sequential framework with complex interacting dynamics of coupling observable and latent variables whereas Fraccaro et al. (","element":"span"},{"href":"#id-9","referenceIndex":10,"text":"Fraccaro et al. ","element":"a"},{"href":"#id-9","referenceIndex":10,"text":"2016","element":"a"},{"text":") utilised heterogeneous backward propagating layers in inference network according to its Markovian properties.","element":"span"}],[{"text":"In this paper, we apply the stochastic neural networks to solve the volatility modelling problem. In other words, we model the dynamics and stochastic nature of the degree of variation, not only the mean itself. Our neural network treatment of volatility modelling is a general one and existing volatility models (e.g., the Heston and GARCH models) are special cases in our formulation.","element":"span"}]]},{"heading":"Preliminaries: Volatility Models","paragraphs":[[{"text":"Volatility models characterise the dynamics of volatility processes, and help estimate and forecast the fluctuation within time series. 
As it is often the case that one seeks for prediction on quantity of interest with a collection of historical information at hand, we presume the conditional variance to have dependency – either deterministic or stochastic – on history, which results in two categories of volatility models.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Deterministic Volatility Models: the GARCH Model Family","element":"span"}],[{"text":"The GARCH model family comprises various linear models that formulate the conditional variance at present as a linear function of observations and variances from the past. Bollerslev’s extension (","element":"span"},{"href":"#id-3","referenceIndex":4,"text":"Bollerslev 1986","element":"a"},{"text":") of Engle’s primitive ARCH model (","element":"span"},{"href":"#id-2","referenceIndex":9,"text":"Engle 1982","element":"a"},{"text":"), referred as generalised ARCH (GARCH) model, is one of the most well-studied","element":"span"}],[{"text":"and widely-used volatility models:","element":"span"}],[{"id":"id-27","style":{"width":"78%"},"width":750,"height":183,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/2-0.png","element":"img"}],[{"text":"where Eq. 
(","element":"span"},{"href":"#id-27","text":"2","element":"a"},{"text":") represents the assumption that the observation ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"style":{"height":6.8},"width":10,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/2-1.png","element":"img","alt":"t","inline":true,"padRight":true},{"text":"follows from the Gaussian distribution with mean 0 and variance ","element":"span"},{"style":{"height":17.55},"width":43.54,"height":43.87,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/2-2.png","element":"img","alt":" σ2t","inline":true,"padRight":true},{"text":"; the (conditional) variance ","element":"span"},{"style":{"height":17.55},"width":43.54,"height":43.87,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/2-3.png","element":"img","alt":" σ2t","inline":true,"padRight":true},{"text":"is fully de- ","element":"span"},{"text":"termined by a linear function (Eq. (","element":"span"},{"href":"#id-27","text":"1","element":"a"},{"text":")) of previous observations ","element":"span"},{"style":{"height":14.8},"width":88.98,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/2-4.png","element":"img","alt":" {x 2 × 1010","inline":true},{"text":". Specifically, the actual dataset for training and evaluation comprises a collection of 2000 series of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"d","element":"span"},{"text":"-dimensional normalised log-return vectors of length ","element":"span"},{"style":{"height":14},"width":130.96,"height":35,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/5-1.png","element":"img","alt":" 2570 (∼","inline":true,"padRight":true},{"text":"7 years) with no missing values. 
We divide the whole dataset into two subsets for training and testing along the time axis: the first 2000 time steps of each series have been used as training samples whereas the rest 570 steps of each series as the test samples.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Baselines ","element":"span"},{"text":"We select several deterministic volatility models from the GARCH family as baselines: 1. Quadratic models","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"text":"ARCH(1); GARCH(1,1); GJR-GARCH(1,1,1);","element":"span"}],[{"text":"2. Absolute value models","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"text":"AVARCH(1); AVGARCH(1,1); TARCH(1,1,1);","element":"span"}],[{"text":"3. Exponential models.","element":"span"}],[{"style":{"fontStyle":"italic"},"text":"• ","element":"span"},{"text":"EARCH(1); EGARCH(1,1);","element":"span"}],[{"text":"Moreover, two stochastic volatility models are compared:","element":"span"}],[{"text":"1. MCMC volatility model: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"stochvol","element":"span"},{"text":";","element":"span"}],[{"text":"2. 
Gaussian process volatility model ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GP-Vol","element":"span"},{"text":".","element":"span"}],[{"text":"For the listed models, we retrieve the authors’ implementations or tools: ","element":"span"},{"style":{"fontStyle":"italic"},"text":"stochvol","element":"span"},{"text":"1","element":"span"},{"text":", ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GP-Vol","element":"span"},{"text":"2 ","element":"span"},{"text":"(the hyperparameters are chosen as suggested in (","element":"span"},{"href":"#id-14","referenceIndex":31,"text":"Wu, Hern´andez-Lobato, and ","element":"a"},{"href":"#id-14","referenceIndex":31,"text":"Ghahramani 2014","element":"a"},{"text":")) and implement the models, such as GARCH, EGARCH, GJR-GARCH, etc., based on several widely-used packages","element":"span"},{"text":"34","element":"span"},{"text":"5 ","element":"span"},{"text":"for time series analysis. All baselines are evaluated in terms of the negative log-likelihood on the test samples, where 1-step-ahead forecasting is carried out in a recursive fashion similar to Algorithm ","element":"span"},{"href":"#id-40","text":"1","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Model Implementation ","element":"span"},{"text":"In our experiments, we predefine the dimensions of observable variables to be ","element":"span"},{"style":{"height":13.59},"width":197.92,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/5-2.png","element":"img","alt":" dim xt = 6","inline":true,"padRight":true},{"text":"and the latent variables ","element":"span"},{"style":{"height":13.59},"width":181.85,"height":33.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/5-3.png","element":"img","alt":"dim zt = 4","inline":true},{"text":". 
Note that the dimension of the latent variable is smaller than that of the observable, which allows us to extract a compact representation. The NSVM implementation in our experiments is composed of two neural networks, namely the generative network (see Eq. (","element":"span"},{"href":"#id-42","text":"16","element":"a"},{"text":")-(","element":"span"},{"href":"#id-42","text":"21","element":"a"},{"text":")) and inference network (see Eq. (","element":"span"},{"href":"#id-43","text":"23","element":"a"},{"text":")-(","element":"span"},{"href":"#id-44","text":"27","element":"a"},{"text":")). Each RNN module contains one hidden layer of size ","element":"span"},{"text":"10 ","element":"span"},{"text":"with GRU cells; MLP modules are 2-layered fully-connected feedforward networks, where the hidden layer is also of size ","element":"span"},{"text":"10 ","element":"span"},{"text":"whereas the output layer splits into two equal-sized sublayers with different activation functions: one applies exponential function to ensure the non-negativity for variance while the","element":"span"}],[{"text":"other uses linear function to calculate mean estimates. 
Thus ","element":"span"},{"style":{"height":17.3},"width":97.09,"height":43.25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/5-4.png","element":"img","alt":"MLPzI","inline":true},{"text":"’s output layer is of size ","element":"span"},{"text":"4 ","element":"span"},{"text":"+ ","element":"span"},{"text":"4 ","element":"span"},{"text":"for ","element":"span"},{"style":{"height":17.32},"width":136.06,"height":43.29,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/5-5.png","element":"img","alt":" { ˜µz, ˜Σz}","inline":true,"padRight":true},{"text":"whereas ","element":"span"},{"text":"the size of ","element":"span"},{"style":{"height":16.68},"width":106.08,"height":41.69,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/5-6.png","element":"img","alt":" MLPxG","inline":true},{"text":"’s output layer is ","element":"span"},{"text":"6 ","element":"span"},{"text":"+ ","element":"span"},{"text":"6 ","element":"span"},{"text":"for ","element":"span"},{"style":{"height":15.2},"width":141.12,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/5-7.png","element":"img","alt":" {µx, Σ x}","inline":true},{"text":". Dur- ","element":"span"},{"text":"ing the training phase, the inference network is connected with the conditional generative network (see, Eq. 
(","element":"span"},{"href":"#id-42","text":"16","element":"a"},{"text":")-(","element":"span"},{"href":"#id-42","text":"18","element":"a"},{"text":")) to establish a bottleneck structure, the latent variable ","element":"span"},{"style":{"fontStyle":"italic"},"text":"z","element":"span"},{"style":{"height":6.8},"width":10,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/5-8.png","element":"img","alt":"t","inline":true,"padRight":true},{"text":"inferred by variational inference (","element":"span"},{"href":"#id-25","referenceIndex":20,"text":"Kingma and Welling 2013","element":"a"},{"text":"; ","element":"span"},{"href":"#id-26","referenceIndex":26,"text":"Rezende, Mohamed, and Wierstra 2014","element":"a"},{"text":") follows a Gaussian approximate posterior; the size of sample paths is set to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"= ","element":"span"},{"text":"100","element":"span"},{"text":". The parameters of both networks are jointly learned, including those for the prior. We introduce Dropout (","element":"span"},{"href":"#id-45","referenceIndex":29,"text":"Srivastava et al. 2014","element":"a"},{"text":") into each RNN module and impose ","element":"span"},{"style":{"fontStyle":"italic"},"text":"L","element":"span"},{"text":"2","element":"span"},{"text":"-norm on the weights of MLP modules as regularisation to prevent overshooting; Adam optimiser (","element":"span"},{"href":"#id-46","referenceIndex":19,"text":"Kingma and Ba ","element":"a"},{"href":"#id-46","referenceIndex":19,"text":"2014","element":"a"},{"text":") is exploited for fast convergence; exponential learning rate decay is adopted to anneal the variations of convergence as time goes. Two covariance configurations are adopted: 1. we stick with diagonal covariance matrices configurations; 2. 
we start with diagonal covariance and then apply rank-1 perturbation (","element":"span"},{"href":"#id-26","referenceIndex":26,"text":"Rezende, Mohamed, and Wierstra ","element":"a"},{"href":"#id-26","referenceIndex":26,"text":"2014","element":"a"},{"text":") during fine-tuning until training is finished. The recursive 1-step-ahead forecasting routine illustrated as Algorithm ","element":"span"},{"href":"#id-40","text":"1 ","element":"a"},{"text":"$31","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Result and Discussion","element":"span"}],[{"text":"The performance of NSVM and baselines is listed for comparison in Table ","element":"span"},{"href":"#id-47","text":"1","element":"a"},{"text":": the performance on the first 10 individual stocks (chosen in alphabetical order but anonymised here) and the average score on all 162 stocks are reported in terms of negative log-likelihood (NLL) measure. The result shows that NSVM has achieved higher accuracy over the baselines on the task of volatility modelling and forecasting on NLL, which validates the high flexibility and rich expressive power of NSVM for volatility modelling and forecasting. In particular, NSVM with rank-1 perturbation (referred to as NSVM-corr in Table ","element":"span"},{"href":"#id-47","text":"1","element":"a"},{"text":") beats all other models in terms of NLL, while NSVM with diagonal covariance matrix (i.e. NSVM-diag) outperforms GARCH(1,1) on 142 out of 162 stocks. 
Although the improvement comes at the cost of longer training time before convergence, it can be mitigated by applying parallel computing techniques as well as more advanced network architecture or training methods.","element":"span"}],[{"id":"id-47","text":"Table 1: The performance of the proposed model and the baselines in terms of negative log-likelihood (NLL) evaluated on the ","element":"figcaption","subtype":"caption"},{"text":"test samples of real-world stock price time series: each row from 1 to 10 lists the average NLL for a specific individual stock; the last row summarises the average NLL of the entire test samples of all 162 stocks.","element":"figcaption","subtype":"caption"}],[{"id":"id-48","style":{"width":"99%"},"width":948,"height":1754,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/6-0.png","element":"img"}],[{"text":"Figure 1: Case studies of volatility forecasting.","element":"figcaption","subtype":"caption"}],[{"text":"Apart from the higher accuracy NSVM obtained, it provides us with a rather general framework to generalise univariate time series models of any specific functional form to the corresponding multivariate cases by extending network dimensions and manipulating the covariance matrices. A case study on real-world financial datasets is illustrated in Fig. ","element":"span"},{"href":"#id-48","text":"1","element":"a"},{"text":".","element":"span"}],[{"text":"NSVM shows higher sensibility on drastic changes and better stability on moderate fluctuations: the response of NSVM in Fig. 
","element":"span"},{"href":"#id-48","text":"1a ","element":"a"},{"text":"is more stable in ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"style":{"height":14.4},"width":243.36,"height":36,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/6-1.png","element":"img","alt":" ∈ [1600, 2250]","inline":true},{"text":", the period of moderate price fluctuation; while for drastic price change at ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"= ","element":"span"},{"text":"2250","element":"span"},{"text":", the model responds with a sharper","element":"span"}],[{"style":{"width":"99%"},"width":948,"height":544,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/6-2.png","element":"img"}],[{"text":"spike compared with the quadratic GARCH model. Furthermore, NSVM demonstrates the inherent non-linearity in both Fig. ","element":"span"},{"href":"#id-48","text":"1a ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-48","text":"1b","element":"a"},{"text":": at each time step within ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"style":{"height":8},"width":21,"height":20,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/6-3.png","element":"img","alt":"∈","inline":true,"padRight":true},{"text":"[","element":"span"},{"text":"1000","element":"span"},{"text":", ","element":"span"},{"text":"2000","element":"span"},{"text":"]","element":"span"},{"text":", the model quickly adapts to the current fluctu-ation level whereas GARCH suffers from a relatively slower decay from the previous influences. The cyan vertical line at ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t ","element":"span"},{"text":"= ","element":"span"},{"text":"2000 ","element":"span"},{"text":"splits the training samples and test samples. 
We show only one instance within our dataset due to the limitation of pages; the performance of other instances is similar.","element":"span"}]]},{"heading":"Conclusion","paragraphs":[[{"text":"$32","element":"span"},{"style":{"fontStyle":"italic"},"text":"stochvol ","element":"span"},{"text":"as well as Gaussian process volatility model ","element":"span"},{"style":{"fontStyle":"italic"},"text":"GP-Vol","element":"span"},{"text":". Future work on NSVM would be to investigate the modelling of time series with non-Gaussian residual distributions, in particular the heavy-tailed distributions e.g. LogNormal ","element":"span"},{"text":"log ","element":"span"},{"style":{"fontStyle":"italic"},"text":"N","element":"span"},{"text":"and Student’s ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":"-distribution.","element":"span"}]]},{"heading":"References","paragraphs":[[{"id":"id-12","text":"[Andersen and Bollerslev 1998] Andersen, T. G., and Bollerslev, T. ","element":"span"},{"text":"1998. Answering the skeptics: Yes, standard volatility models do provide accurate forecasts. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"International economic review ","element":"span"},{"text":"885– 905.","element":"span"}],[{"id":"id-19","text":"[Bahdanau, Cho, and Bengio 2014] Bahdanau, D.; Cho, K.; and ","element":"span"},{"text":"Bengio, Y. 2014. Neural machine translation by jointly learning to align and translate. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"CoRR ","element":"span"},{"text":"abs/1409.0473.","element":"span"}],[{"id":"id-10","text":"[Bollerslev, Engle, and Nelson 1994] Bollerslev, T.; Engle, R. F.; ","element":"span"},{"text":"and Nelson, D. B. 1994. Arch models. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Handbook of econometrics ","element":"span"},{"text":"4:2959–3038.","element":"span"}],[{"id":"id-3","text":"[Bollerslev 1986] Bollerslev, T. 1986. 
Generalized autoregressive ","element":"span"},{"text":"conditional heteroskedasticity. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of econometrics ","element":"span"},{"text":"31(3):307– 327.","element":"span"}],[{"id":"id-22","text":"[Cho et al. 2014] Cho, K.; van Merrienboer, B.; G¨ulc¸ehre, C¸ .; Bah- ","element":"span"},{"text":"danau, D.; Bougares, F.; Schwenk, H.; and Bengio, Y. 2014. Learning phrase representations using RNN encoder-decoder for statistical machine translation. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing","element":"span"},{"text":", 1724–1734.","element":"span"}],[{"id":"id-18","text":"[Chorowski et al. 2015] Chorowski, J.; Bahdanau, D.; Serdyuk, D.; ","element":"span"},{"text":"Cho, K.; and Bengio, Y. 2015. Attention-based models for speech recognition. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems 28","element":"span"},{"text":", 577–585.","element":"span"}],[{"id":"id-8","text":"[Chung et al. 2015] Chung, J.; Kastner, K.; Dinh, L.; Goel, K.; ","element":"span"},{"text":"Courville, A. C.; and Bengio, Y. 2015. A recurrent latent variable model for sequential data. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems 28","element":"span"},{"text":", 2980–2988.","element":"span"}],[{"id":"id-11","text":"[Engle and Kroner 1995] Engle, R. F., and Kroner, K. F. ","element":"span"},{"text":"1995. Multivariate simultaneous generalized arch. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Econometric theory ","element":"span"},{"text":"11(01):122–150.","element":"span"}],[{"id":"id-2","text":"[Engle 1982] Engle, R. F. 1982. Autoregressive conditional het- ","element":"span"},{"text":"eroscedasticity with estimates of the variance of united kingdom inflation. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Econometrica: Journal of the Econometric Society ","element":"span"},{"text":"987– 1007.","element":"span"}],[{"id":"id-9","text":"[Fraccaro et al. 2016] Fraccaro, M.; Sønderby, S. K.; Paquet, U.; ","element":"span"},{"text":"and Winther, O. 2016. Sequential neural models with stochastic layers. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems 29","element":"span"},{"text":", 2199–2207.","element":"span"}],[{"id":"id-28","text":"[Glosten, Jagannathan, and Runkle 1993] Glosten, L. R.; Jagan- ","element":"span"},{"text":"nathan, R.; and Runkle, D. E. 1993. On the relation between the expected value and the volatility of the nominal excess return on stocks. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The journal of finance ","element":"span"},{"text":"48(5):1779–1801.","element":"span"}],[{"id":"id-5","text":"[Heston 1993] Heston, S. L. 1993. A closed-form solution for op- ","element":"span"},{"text":"tions with stochastic volatility with applications to bond and currency options. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Review of financial studies ","element":"span"},{"text":"6(2):327–343.","element":"span"}],[{"id":"id-20","text":"[Hochreiter and Schmidhuber 1997] Hochreiter, S., and Schmidhu- ","element":"span"},{"text":"ber, J. ","element":"span"},{"text":"1997. ","element":"span"},{"text":"Long short-term memory. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Neural Computation ","element":"span"},{"text":"9(8):1735–1780.","element":"span"}],[{"id":"id-4","text":"[Hull and White 1987] Hull, J., and White, A. 1987. The pricing of ","element":"span"},{"text":"options on assets with stochastic volatilities. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The journal of finance ","element":"span"},{"text":"42(2):281–300.","element":"span"}],[{"id":"id-0","text":"[Hull 2006] Hull, J. C. 
2006. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Options, futures, and other derivatives","element":"span"},{"text":". Pearson Education India.","element":"span"}],[{"id":"id-6","text":"[Jacquier, Polson, and Rossi 2002] Jacquier, E.; Polson, N. G.; and ","element":"span"},{"text":"Rossi, P. E. 2002. Bayesian analysis of stochastic volatility models. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Business & Economic Statistics ","element":"span"},{"text":"20(1):69–87.","element":"span"}],[{"id":"id-13","text":"[Kastner and Fr¨uhwirth-Schnatter 2014] Kastner, ","element":"span"},{"text":"G., ","element":"span"},{"text":"and Fr¨uhwirth-Schnatter, S. ","element":"span"},{"text":"2014. ","element":"span"},{"text":"Ancillarity-sufficiency interweaving strategy (ASIS) for boosting MCMC estimation of stochastic volatility models. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Computational Statistics & Data Analysis ","element":"span"},{"text":"76:408–423.","element":"span"}],[{"id":"id-7","text":"[Kim, Shephard, and Chib 1998] Kim, S.; Shephard, N.; and Chib, ","element":"span"},{"text":"S. 1998. Stochastic volatility: likelihood inference and comparison with arch models. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"The review of economic studies ","element":"span"},{"text":"65(3):361–393.","element":"span"}],[{"id":"id-46","text":"[Kingma and Ba 2014] Kingma, D. P., and Ba, J. 2014. Adam: A ","element":"span"},{"text":"method for stochastic optimization. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"CoRR ","element":"span"},{"text":"abs/1412.6980.","element":"span"}],[{"id":"id-25","text":"[Kingma and Welling 2013] Kingma, D. P., and Welling, M. 2013. ","element":"span"},{"text":"Auto-encoding variational bayes. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"CoRR ","element":"span"},{"text":"abs/1312.6114.","element":"span"}],[{"id":"id-17","text":"[Krizhevsky, Sutskever, and Hinton 2012] Krizhevsky, ","element":"span"},{"text":"A.; Sutskever, I.; and Hinton, G. E. ","element":"span"},{"text":"2012. ","element":"span"},{"text":"Imagenet classifica-tion with deep convolutional neural networks. ","element":"span"},{"text":"In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems 25","element":"span"},{"text":", 1106–1114.","element":"span"}],[{"id":"id-15","text":"[LeCun, Bengio, and Hinton 2015] LeCun, Y.; Bengio, Y.; and Hin- ","element":"span"},{"text":"ton, G. E. 2015. Deep learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Nature ","element":"span"},{"text":"521(7553):436–444.","element":"span"}],[{"id":"id-41","text":"[Little and Rubin 2014] Little, R. J., and Rubin, D. B. 2014. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Statistical analysis with missing data","element":"span"},{"text":". John Wiley & Sons.","element":"span"}],[{"id":"id-31","text":"[Nelson 1991] Nelson, D. B. 1991. Conditional heteroskedasticity ","element":"span"},{"text":"in asset returns: A new approach. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Econometrica: Journal of the Econometric Society ","element":"span"},{"text":"347–370.","element":"span"}],[{"id":"id-1","text":"[Poon and Granger 2003] Poon, S.-H., and Granger, C. W. 2003. ","element":"span"},{"text":"Forecasting volatility in financial markets: A review. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of economic literature ","element":"span"},{"text":"41(2):478–539.","element":"span"}],[{"id":"id-26","text":"[Rezende, Mohamed, and Wierstra 2014] Rezende, ","element":"span"},{"text":"D. ","element":"span"},{"text":"J.; ","element":"span"},{"text":"Mohamed, S.; and Wierstra, D. 2014. 
Stochastic backpropagation and approximate inference in deep generative models. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 31st International Conference on Machine Learning","element":"span"},{"text":", 1278–1286.","element":"span"}],[{"id":"id-16","text":"[Schmidhuber 2015] Schmidhuber, J. 2015. Deep learning in neural ","element":"span"},{"text":"networks: An overview. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Neural Networks ","element":"span"},{"text":"61:85–117.","element":"span"}],[{"id":"id-21","text":"[Schuster and Paliwal 1997] Schuster, M., and Paliwal, K. K. 1997. ","element":"span"},{"text":"Bidirectional recurrent neural networks. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"IEEE Trans. Signal Processing ","element":"span"},{"text":"45(11):2673–2681.","element":"span"}],[{"id":"id-45","text":"[Srivastava et al. 2014] Srivastava, N.; Hinton, G. E.; Krizhevsky, ","element":"span"},{"text":"A.; Sutskever, I.; and Salakhutdinov, R. 2014. Dropout: a simple way to prevent neural networks from overfitting. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Machine Learning Research ","element":"span"},{"text":"15(1):1929–1958.","element":"span"}],[{"id":"id-24","text":"[van den Oord, Kalchbrenner, and Kavukcuoglu 2016] van ","element":"span"},{"text":"den Oord, A.; Kalchbrenner, N.; and Kavukcuoglu, K. 2016. Pixel recurrent neural networks. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 33rd International Conference on Machine Learning","element":"span"},{"text":", 1747–1756.","element":"span"}],[{"id":"id-14","text":"[Wu, Hernández-Lobato, and Ghahramani 2014] Wu, ","element":"span"},{"text":"Y.; Hernández-Lobato, J. M.; and Ghahramani, Z. ","element":"span"},{"text":"2014. ","element":"span"},{"text":"Gaussian process volatility model. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems 27","element":"span"},{"text":", 1044–1052.","element":"span"}],[{"id":"id-23","text":"[Xu et al. 2015] Xu, K.; Ba, J.; Kiros, R.; Cho, K.; Courville, A. C.; ","element":"span"},{"text":"Salakhutdinov, R.; Zemel, R. S.; and Bengio, Y. 2015. Show, attend and tell: Neural image caption generation with visual attention. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the 32nd International Conference on Machine Learning","element":"span"},{"text":", 2048–2057.","element":"span"}],[{"id":"id-29","text":"[Zakoian 1994] Zakoian, J.-M. 1994. Threshold heteroskedastic ","element":"span"},{"text":"models. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Journal of Economic Dynamics and control ","element":"span"},{"text":"18(5):931– 955.","element":"span"}]]},{"heading":"Learning Parameters / Calibration","paragraphs":[[{"text":"Given the observations ","element":"span"},{"style":{"fontStyle":"italic"},"text":"X ","element":"span"},{"style":{"height":14.8},"width":136.77,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-0.png","element":"img","alt":" = {x1:T }","inline":true},{"text":", we target at maximising the marginal log-likelihood ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p","element":"span"},{"style":{"height":15.2},"width":77.24,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-1.png","element":"img","alt":"Φ(X)","inline":true,"padRight":true},{"text":"w.r.t. 
","element":"span"},{"style":{"height":10.4},"width":27,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-2.png","element":"img","alt":" Φ","inline":true},{"text":", where the actual posterior ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p","element":"span"},{"style":{"height":15.2},"width":115.82,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-3.png","element":"img","alt":"Φ(Z|X)","inline":true,"padRight":true},{"text":"is involved. Because of the intractability of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p","element":"span"},{"style":{"height":15.2},"width":115.81,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-4.png","element":"img","alt":"Φ(Z|X)","inline":true},{"text":", the exact inference is not applicable; we have to seek for approximate solutions instead.","element":"span"}],[{"text":"We factorise the marginal log-likelihood ","element":"span"},{"style":{"height":15.2},"width":156.45,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-5.png","element":"img","alt":" log pΦ(X)","inline":true,"padRight":true},{"text":"as","element":"span"}],[{"id":"id-49","style":{"width":"93%"},"width":893,"height":437,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-6.png","element":"img"}],[{"text":"Note that we have introduced a tractable, ","element":"span"},{"style":{"height":10.4},"width":26,"height":26,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-7.png","element":"img","alt":" Ψ","inline":true},{"text":"-parameterised distribution ","element":"span"},{"style":{"fontStyle":"italic"},"text":"q","element":"span"},{"style":{"height":15.2},"width":115.46,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-8.png","element":"img","alt":"Ψ(Z|X)","inline":true,"padRight":true},{"text":"from a flexible family of distributions to 
approximate the actual posterior ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p","element":"span"},{"style":{"height":15.2},"width":115.81,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-9.png","element":"img","alt":"Φ(Z|X)","inline":true},{"text":". The evidence lower bound (ELBO) ","element":"span"},{"style":{"height":14.8},"width":205.28,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-10.png","element":"img","alt":" L[q; X, Φ, Ψ]","inline":true,"padRight":true},{"text":"in Eq. (","element":"span"},{"href":"#id-49","text":"30","element":"a"},{"text":") is essentially a functional w.r.t. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"q","element":"span"},{"text":", conditioning on the observations ","element":"span"},{"style":{"fontStyle":"italic"},"text":"X ","element":"span"},{"text":"and parameterised by the parameter sets ","element":"span"},{"style":{"height":12.8},"width":67.06,"height":32,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-11.png","element":"img","alt":" Φ, Ψ","inline":true,"padRight":true},{"text":"of both generative and inference models. Theoretically, ELBO ensures a lower bound on the marginal log-likelihood, and can be maximised via gradient-based optimisers.","element":"span"}],[{"text":"It is usually the case that Eq. (","element":"span"},{"href":"#id-49","text":"30","element":"a"},{"text":") lacks a closed-form expression. We have to estimate the ELBO using Monte Carlo integration on the latent variable ","element":"span"},{"style":{"fontStyle":"italic"},"text":"z","element":"span"},{"style":{"height":6.8},"width":10,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-12.png","element":"img","alt":"t","inline":true},{"text":". Provided ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"sample paths drawn by the inference model defined in Eq. 
(","element":"span"},{"href":"#id-39","text":"22","element":"a"},{"text":"), the estimator of ELBO can be calculated as the path average:","element":"span"}],[{"style":{"width":"99%"},"width":954,"height":118,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-13.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":22.12},"width":122.49,"height":55.3,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-14.png","element":"img","alt":" {z⟨1:S⟩1:T }","inline":true,"padRight":true},{"text":"denotes the collection of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"S ","element":"span"},{"text":"sample paths.","element":"span"}],[{"text":"By assuming the latent variable ","element":"span"},{"style":{"fontStyle":"italic"},"text":"z","element":"span"},{"style":{"height":6.8},"width":10,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-15.png","element":"img","alt":"t","inline":true,"padRight":true},{"text":"being Gaussian, we can readily apply the reparameterisation technique (","element":"span"},{"href":"#id-25","referenceIndex":20,"text":"Kingma and ","element":"a"},{"href":"#id-25","referenceIndex":20,"text":"Welling 2013","element":"a"},{"text":") to ","element":"span"},{"style":{"fontStyle":"italic"},"text":"z","element":"span"},{"style":{"height":6.8},"width":10,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-16.png","element":"img","alt":"t","inline":true,"padRight":true},{"text":"to form an unbiased gradient estimator:","element":"span"}],[{"style":{"width":"78%"},"width":750,"height":56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-17.png","element":"img"}],[{"id":"id-50","text":"where ","element":"span"},{"style":{"height":20.63},"width":236.45,"height":51.58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-18.png","element":"img","alt":" ϵ ⟨s⟩t ∼ N(0, 
Iz)","inline":true,"padRight":true},{"text":"is the standard Gaussian variable.","element":"span"}],[{"style":{"width":"47%"},"width":456,"height":569,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-19.png","element":"img"}],[{"text":"Figure 2: Generalised stochastic volatility model.","element":"figcaption","subtype":"caption"}],[{"text":"The reparameterisation extracts the randomness out of the latent variable ","element":"span"},{"style":{"fontStyle":"italic"},"text":"z","element":"span"},{"style":{"height":20.63},"width":41.78,"height":51.58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-20.png","element":"img","alt":"⟨s⟩t","inline":true,"padRight":true},{"text":"via ","element":"span"},{"style":{"height":20.63},"width":59.86,"height":51.58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-21.png","element":"img","alt":" ϵ ⟨s⟩t","inline":true,"padRight":true},{"text":", leaving ","element":"span"},{"style":{"height":16.11},"width":35.41,"height":40.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-22.png","element":"img","alt":" ˜µzt","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":18.04},"width":37.92,"height":45.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-23.png","element":"img","alt":" ˜Azt","inline":true,"padRight":true},{"text":", i.e. the mean ","element":"span"},{"text":"and standard deviation of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"z","element":"span"},{"style":{"height":20.63},"width":41.79,"height":51.58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-24.png","element":"img","alt":"⟨s⟩t","inline":true,"padRight":true},{"text":", being deterministic functions. 
It guarantees that the gradient-based optimisation techniques are applicable by isolating the model parameters ( ","element":"span"},{"style":{"height":16.11},"width":35.41,"height":40.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-25.png","element":"img","alt":" ˜µzt","inline":true,"padRight":true},{"text":"and ","element":"span"},{"style":{"height":18.04},"width":37.92,"height":45.1,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-26.png","element":"img","alt":" ˜Azt","inline":true,"padRight":true},{"text":") ","element":"span"},{"text":"from the sampling procedure (involving ","element":"span"},{"style":{"height":20.63},"width":59.86,"height":51.58,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-27.png","element":"img","alt":" ϵ ⟨s⟩t","inline":true,"padRight":true},{"text":").","element":"span"}],[{"id":"id-51","style":{"width":"92%"},"width":887,"height":913,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-28.png","element":"img"}],[{"text":"Figure 3: Two components of the generative network.","element":"figcaption","subtype":"caption"}]]},{"heading":"Illustrations of stochastic volatility modelling, training and forecasting","paragraphs":[[{"style":{"width":"100%"},"width":954,"height":184,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-29.png","element":"img"}],[{"text":"By introducing hidden state ","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"style":{"height":16.11},"width":14.84,"height":40.28,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-30.png","element":"img","alt":"zt","inline":true,"padRight":true},{"text":"and 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"h","element":"span"},{"style":{"height":15.26},"width":17.72,"height":38.15,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-31.png","element":"img","alt":"xt","inline":true,"padRight":true},{"text":"as memory for his- ","element":"span"},{"text":"torical information integration, the formulation is essentially equivalent to the recurrent model illustrated as Fig. ","element":"span"},{"href":"#id-50","text":"2","element":"a"},{"text":".","element":"span"}],[{"text":"We decompose the recurrent model into two components in a similar way as one would apply in factorising ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p","element":"span"},{"style":{"height":15.2},"width":119.28,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-32.png","element":"img","alt":"Φ(X, Z)","inline":true,"padRight":true},{"text":"into the marginal distribution ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p","element":"span"},{"style":{"height":15.2},"width":74.97,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-33.png","element":"img","alt":"Φ(Z)","inline":true,"padRight":true},{"text":"in Eq. (","element":"span"},{"href":"#id-36","text":"13","element":"a"},{"text":") and the conditional distribution ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p","element":"span"},{"style":{"height":15.2},"width":115.82,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-34.png","element":"img","alt":"Φ(X|Z)","inline":true,"padRight":true},{"text":"in Eq. 
(","element":"span"},{"href":"#id-36","text":"14","element":"a"},{"text":").","element":"span"}],[{"text":"The marginal ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p","element":"span"},{"style":{"height":15.2},"width":74.97,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-35.png","element":"img","alt":"Φ(Z)","inline":true,"padRight":true},{"text":"is implemented by","element":"span"}],[{"style":{"width":"54%"},"width":518,"height":164,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-36.png","element":"img"}],[{"text":"which represents an autoregressive network for the latent ","element":"span"},{"style":{"fontStyle":"italic"},"text":"z","element":"span"},{"style":{"height":6.8},"width":10,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-37.png","element":"img","alt":"t","inline":true,"padRight":true},{"text":"as illustrated in Fig. ","element":"span"},{"href":"#id-51","text":"3a","element":"a"},{"text":". The conditional ","element":"span"},{"style":{"fontStyle":"italic"},"text":"p","element":"span"},{"style":{"height":15.2},"width":115.81,"height":38,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-38.png","element":"img","alt":"Φ(X|Z)","inline":true,"padRight":true},{"text":"is built as","element":"span"}],[{"style":{"width":"59%"},"width":572,"height":152,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-39.png","element":"img"}],[{"text":"which corresponds to a conditional generative network for the observable ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"style":{"height":6.8},"width":10,"height":17,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/8-40.png","element":"img","alt":"t","inline":true,"padRight":true},{"text":"as in Fig. 
","element":"span"},{"href":"#id-51","text":"3b","element":"a"},{"text":".","element":"span"}],[{"id":"id-53","style":{"width":"72%"},"width":1461,"height":537,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/9-0.png","element":"img"}],[{"text":"Figure 4: Illustration of the training setup.","element":"figcaption","subtype":"caption"}],[{"id":"id-52","style":{"width":"54%"},"width":522,"height":746,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/9-1.png","element":"img"}],[{"text":"Figure 5: Architecture of the inference network.","element":"figcaption","subtype":"caption"}],[{"text":"On the other hand, the inference network is implemented in a similar recurrent fashion, as an autoregressive network with bidirectional dependencies:","element":"span"}],[{"style":{"width":"70%"},"width":670,"height":303,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/9-2.png","element":"img"}],[{"text":"The architecture of inference network is illustrated in Fig. ","element":"span"},{"href":"#id-52","text":"5","element":"a"},{"text":".","element":"span"}],[{"text":"The training procedure involves the inference network (in Fig. ","element":"span"},{"href":"#id-52","text":"5","element":"a"},{"text":") and the conditional generative network (in Fig. ","element":"span"},{"href":"#id-51","text":"3b","element":"a"},{"text":"); the autoregressive network (in Fig. ","element":"span"},{"href":"#id-51","text":"3a","element":"a"},{"text":") will not be utilised.","element":"span"}],[{"text":"The historical observations ","element":"span"},{"style":{"height":14.8},"width":88.98,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/1712.00504/images/9-3.png","element":"img","alt":" {x