We start with some R warm-up.

# Numerical value
v <- 23.5
v
## [1] 23.5
# Create a vector
apple <- c('red','green',"yellow")
apple
## [1] "red"    "green"  "yellow"
# Create a matrix
M <- matrix( c('a','a','b','c','b','a'), nrow = 2, ncol = 3, byrow = TRUE)
M
##      [,1] [,2] [,3]
## [1,] "a"  "a"  "b" 
## [2,] "c"  "b"  "a"
# Removing a variable
rm(M)

Some basic statistics with R: normal and uniform distributions.

# Normal distribution

# Let's generate a vectors of random numbers from a normal distribution
x1 <- rnorm(10000, mean=0, sd=1)
# The breaks argument specifies how many bars are in the histogram
hist(x1, probability=TRUE, breaks = 100)

# pnorm returns the integral from −\inf to q of the pdf of the normal
# distribution where q is a Z-score
# a z-score is the number of standard deviations from the mean a data point is
pnorm(0)
## [1] 0.5
pnorm(2)
## [1] 0.9772499
# qnorm function is simply the inverse of the cdf, which you can also 
# think of as the inverse of pnorm
qnorm(0.5)
## [1] 0
qnorm(0.98)
## [1] 2.053749
# Uniform distribution
punif(0.75, min = 0, max = 1)
## [1] 0.75
qunif(0.75, min = 0, max = 1)
## [1] 0.75
x2 <- runif(1000, min = 0, max = 1)
hist(x2, probability=FALSE, breaks = 100)

# Descriptive statistics
mean(x1)
## [1] -0.005080723
sd(x1)
## [1] 1.000038
var(x1)
## [1] 1.000076
min(x1)
## [1] -3.656309
max(x1)
## [1] 3.442262
median(x1)
## [1] -0.01164455
range(x1)
## [1] -3.656309  3.442262
quantile(x1)
##          0%         25%         50%         75%        100% 
## -3.65630909 -0.67498496 -0.01164455  0.66671920  3.44226164
summary(x1)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -3.656309 -0.674985 -0.011645 -0.005081  0.666719  3.442262

The volesti package

volesti is a C++ package (with an R interface) for computing estimations of volume of polytopes given by a set of points or linear inequalities or Minkowski sum of segments (zonotopes). There are two algorithms for volume estimation and algorithms for sampling, rounding and rotating polytopes.

We can download the R package from https://CRAN.R-project.org/package=volesti

# first load the volesti library
#install.packages('volesti')
library(volesti)
## Loading required package: Rcpp
packageVersion("volesti")
## [1] '1.0.3'

You have access to the documentation of volesti functions like volume computation and sampling.

help("volume")
help("sample_points")

Full documentation here: https://cran.r-project.org/web/packages/volesti/volesti.pdf

Let’s try our first volesti command to estimate the volume of a 3-dimensional cube \(\{-1\leq x_i\leq 1, x_i\in\mathbb{R}\ |\ i=1,2,3\}\)

P <- gen_cube(3,'H')
print(volume(P))
## [1] 8.023323

What is the exact volume of P? Did we obtain a good estimation?

Sampling

Sampling uniformly in the square.

library(ggplot2)

x1<-runif(1000, min = -1, max = 1)
x2<-runif(1000, min = -1, max = 1)

g<-ggplot(data.frame( x=x1, y=x2 )) + geom_point( aes(x=x, y=y))

g<-g+annotate("path",
   x=cos(seq(0,2*pi,length.out=100)),
   y=sin(seq(0,2*pi,length.out=100)),color="red")+coord_fixed()

plot(g)

Can we estimate the volume of the red ball via sampling? Solution: rejection sampling. The following computation illustrates that this will fail in (not so) high dimensions.

# run in around 1 min
for (d in 2:20) {
  num_of_points <- 100000
  count_inside <- 0

  points1 <- matrix(nrow=d, ncol=num_of_points)
  for (i in 1:d) {
    x <- runif(num_of_points, min = -1, max = 1)
    for (j in 1:num_of_points) {
      points1[i,j] <- x[j]
    }
  }

  for (i in 1:num_of_points) {
    if (norm(points1[,i], type="2") < 1) {
      count_inside <- count_inside + 1
    }
  }
  vol_estimation <- count_inside*2^d/num_of_points
  vol_exact <- pi^(d/2)/gamma(d/2+1)

  cat(d, vol_estimation, vol_exact, abs(vol_estimation- vol_exact)/
vol_exact, "\n")
}
## 2 3.14128 3.141593 9.952073e-05 
## 3 4.1864 4.18879 0.0005706194 
## 4 4.9336 4.934802 0.0002436168 
## 5 5.23456 5.263789 0.005552847 
## 6 5.09824 5.167713 0.01344362 
## 7 4.6208 4.724766 0.02200447 
## 8 4.04224 4.058712 0.004058461 
## 9 3.14368 3.298509 0.04693906 
## 10 2.42688 2.550164 0.04834357 
## 11 1.86368 1.884104 0.0108401 
## 12 1.6384 1.335263 0.2270244 
## 13 0.73728 0.9106288 0.1903616 
## 14 0.49152 0.5992645 0.1797946 
## 15 0.32768 0.3814433 0.140947 
## 16 0.65536 0.2353306 1.784848 
## 17 0 0.1409811 1 
## 18 0 0.08214589 1 
## 19 0 0.0466216 1 
## 20 0 0.02580689 1

Sampling via random walks

volesti supports 3 types of random walks

  1. Ball walk

drawing drawing drawing drawing drawing

  1. Random directions hit-and-run

drawing drawing drawing

  1. Coordinate directions hit-and-run

drawing drawing drawing

There are two important parameters cost per step and mixing time that affects the accuracy and performance of the walks. Below we illustrate this by choosing different walk steps for each walk while sampling on the 100-dimensional cube.

#run in few secs
library(ggplot2)
library(volesti)
for (step in c(1,20,100,150)){
  for (walk in c("CDHR", "RDHR", "BW")){
    P <- gen_cube(100, 'H')
    points1 <- sample_points(P,n=1000,random_walk = list("walk" = walk, "walk_length" = step))
    g<-plot(ggplot(data.frame( x=points1[1,], y=points1[2,] )) +
geom_point( aes(x=x, y=y, color=walk)) + coord_fixed(xlim = c(-1,1),
ylim = c(-1,1)) + ggtitle(sprintf("walk length=%s", step, walk)))
  }
}


Volume estimation

Now let’s compute our first example. The volume of the 3-dimensional cube.

library(geometry)

PV <- gen_cube(3,'V')
str(PV)
## Reference class 'Rcpp_Vpolytope' [package "volesti"] with 3 fields
##  $ V        : num [1:8, 1:3] -1 1 -1 1 -1 1 -1 1 -1 -1 ...
##  $ dimension: num 3
##  $ type     : int 2
##  and 16 methods, of which 2 are  possibly relevant:
##    finalize, initialize
geom_values <- convhulln(PV@V, options = 'FA')
vol_approx <- volume(PV)

cat(sprintf("exact vol = %f\napprx vol = %f\nrelative error = %f\n",
            geom_values$vol, vol_approx, abs(geom_values$vol-vol_approx)/geom_values$vol))
## exact vol = 8.000000
## apprx vol = 7.706674
## relative error = 0.036666

Now try a higher dimensional example. By setting the error parameter we can control the approximation of the algorithm.

PH = gen_cube(10,'H')
volumes <- list()
for (i in 1:10) {
  volumes[[i]] <- volume(PH, error=1) # default parameters
}
options(digits=10)
summary(as.numeric(volumes))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##  976.5442 1028.4148 1059.5077 1046.4744 1070.8770 1085.9729
volumes <- list()
for (i in 1:10) {
  volumes[[i]] <- volume(PH, error=0.5)
}
summary(as.numeric(volumes))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  988.011 1010.110 1023.105 1017.331 1026.995 1031.262

Deterministic algorithms for volume are limited to low dimensions (e.g. less than \(15\))

library(geometry)

P = GenRandVpoly(15, 20)
# this will return an error about memory allocation, i.e. the dimension is too high for qhull
#tim1 <- system.time({ geom_values = convhulln(P$V, options = 'FA') })

#warning: this also takes a lot of time in v1.0.3
#time <- system.time({ vol <- volume(P) })
#print(vol)
#print(time)

Volume of Birkhoff polytopes

We now continue with a more interesting example, the 10-th Birkhoff polytope. It is known from https://arxiv.org/pdf/math/0305332.pdf that its volume equals

\(\text{vol}(\mathcal{B}_{10}) = \frac{727291284016786420977508457990121862548823260052557333386607889}{828160860106766855125676318796872729344622463533089422677980721388055739956270293750883504892820848640000000}\)

obtained via massive parallel computation.

library(volesti)
B10 <- fileToMatrix('data/birk10.ine') #this is actually file to Polytope
exact <- 727291284016786420977508457990121862548823260052557333386607889/828160860106766855125676318796872729344622463533089422677980721388055739956270293750883504892820848640000000

# warning the following will take around half an hour
#time <- system.time({ vol <- volume(B10, Algo = 'CG') })
#print(vol)
#print(time)

Compare our computed estimation with the “normalized” floating point version of \(\text{vol}(\mathcal{B}_{10})\)

n <- 10
vol_B10 <- 727291284016786420977508457990121862548823260052557333386607889/828160860106766855125676318796872729344622463533089422677980721388055739956270293750883504892820848640000000
print(vol_B10/(n^(n-1)))
## [1] 8.782005031e-55

Rounding

We generate skinny polytopes, in particular skinny cubes of the form \(\{x=(x_1,\dots,x_d)\ |\ x_1\leq 100, x_1\geq-100,x_i\leq 1,x_i\geq-1,x_i\in \mathbb{R}, \text{ for } i=2,\dots,d\}\).

Our random walks perform poorly on those polytopes especially as the dimension increases. Note that if we use the CDHR walk here is cheating since we take advantage of the instance structure.

library(ggplot2)

P = GenSkinnyCube(2)
points1 = sample_points(P, WalkType = "CDHR", N=1000)
ggplot(data.frame(x = c(points1[1,]), y = c(points1[2,])), aes(x=x,
y=y)) + geom_point() +labs(x =" ", y = " ")+coord_fixed(ylim =
c(-10,10))

points1 = sample_points(P, WalkType = "RDHR", N=1000)
ggplot(data.frame(x = c(points1[1,]), y = c(points1[2,])), aes(x=x,
y=y)) + geom_point() +labs(x =" ", y = " ")+coord_fixed(ylim =
c(-10,10))

P = GenSkinnyCube(10)
points1 = sample_points(P, WalkType = "CDHR", N=1000)
ggplot(data.frame(x = c(points1[1,]), y = c(points1[2,])), aes(x=x,
y=y)) + geom_point() +labs(x =" ", y = " ")+coord_fixed(xlim =
c(-100,100), ylim = c(-10,10))

points1 = sample_points(P, WalkType = "RDHR", N=1000)
ggplot(data.frame(x = c(points1[1,]), y = c(points1[2,])), aes(x=x,
y=y)) + geom_point() +labs(x =" ", y = " ")+coord_fixed(xlim =
c(-100,100), ylim = c(-10,10))

P = GenSkinnyCube(100)
points1 = sample_points(P, WalkType = "RDHR", N=1000)
ggplot(data.frame(x = c(points1[1,]), y = c(points1[2,])), aes(x=x,
y=y)) + geom_point() +labs(x =" ", y = " ")+coord_fixed(xlim =
c(-100,100), ylim = c(-10,10))

Then we examine the problem of rounding by sampling in the original and then in the rounded polytope and look at the effect in volume computation.

library(ggplot2)

d <- 10

P = GenSkinnyCube(d)
points1 = sample_points(P, WalkType = "CDHR", N=1000)
ggplot(data.frame(x = c(points1[1,]), y = c(points1[2,])), aes(x=x,
y=y)) + geom_point() +labs(x =" ", y = " ")+coord_fixed(ylim =
c(-10,10))

P <- rand_rotate(P)$P

points1 = sample_points(P, WalkType = "RDHR", N=1000)
ggplot(data.frame(x = c(points1[1,]), y = c(points1[2,])), aes(x=x,
y=y)) + geom_point() +labs(x =" ", y = " ")+coord_fixed()

exact <- 2^d*100
cat("exact volume                 = ", exact , "\n")
## exact volume                 =  102400
cat("volume estimation (no round) = ", volume(P, WalkType = "RDHR",
rounding=FALSE), "\n")
## volume estimation (no round) =  20941.01857
cat("volume estimation (rounding) = ", volume(P, WalkType = "RDHR",
rounding=TRUE), "\n")
## volume estimation (rounding) =  50753.25401
# 1st step of rounding
res1 = round_polytope(P)
points2 = sample_points(res1$P, WalkType = "RDHR", N=1000)
ggplot(data.frame(x = c(points2[1,]), y = c(points2[2,])), aes(x=x,
y=y)) + geom_point() +labs(x =" ", y = " ")+coord_fixed()

volesti <- volume(res1$P) * res1$round_value
cat("volume estimation (1st step) = ", volesti, " rel. error=",
abs(exact-volesti)/exact,"\n")
## volume estimation (1st step) =  101244.6802  rel. error= 0.01128241946
# 2nd step of rounding
res2 = round_polytope(res1$P)
points2 = sample_points(res2$P, WalkType = "RDHR", N=1000)
ggplot(data.frame(x = c(points2[1,]), y = c(points2[2,])), aes(x=x,
y=y)) + geom_point() +labs(x =" ", y = " ")+coord_fixed()

volesti <- volume(res2$P) * res1$round_value * res2$round_value
cat("volume estimation (2nd step) = ", volesti, " rel. error=",
abs(exact-volesti)/exact,"\n")
## volume estimation (2nd step) =  106821.7953  rel. error= 0.04318159434
# 3rd step of rounding
res3 = round_polytope(res2$P)
points2 = sample_points(res3$P, WalkType = "RDHR", N=1000)
ggplot(data.frame(x = c(points2[1,]), y = c(points2[2,])), aes(x=x,
y=y)) + geom_point() +labs(x =" ", y = " ")+coord_fixed()

volesti <- volume(res3$P) * res1$round_value * res2$round_value *
res3$round_value
cat("volume estimation (3rd step) = ", volesti, " rel. error=",
abs(exact-volesti)/exact,"\n")
## volume estimation (3rd step) =  97000.54463  rel. error= 0.05272905635

Integration

We can use sampling and volume estimation to estimate integrals over polyhedral domains. Below there is an example with a degree 2 polynomial over a 3-dimensional cube.

library(cubature) # load the package "cubature"
f <- function(x) { 2/3 * (2 * x[1]^2 + x[2] + x[3]) + 10 }  # "x" is vector
adaptIntegrate(f, lowerLimit = c(-1, -1, -1), upperLimit = c(1, 1, 1))$integral
## [1] 83.55555556
# Simple Monte Carlo integration
# https://en.wikipedia.org/wiki/Monte_Carlo_integration
P = gen_cube(3, 'H')
num_of_points <- 10000
points1 <- sample_points(P, WalkType = "RDHR", walk_step = 100, N=num_of_points)
int<-0
for (i in 1:num_of_points){
  int <- int + f(points1[,i])
}
V <- volume(P)
print(int*V/num_of_points)
## [1] 80.39597527

Counting linear extensions

Let \(G= (V, E)\) be an acyclic digraph with \(V= [n] :=\{1,2, . . . , n\}\). One might want to consider \(G\) as a representation of the partially ordered set (poset) \(V:i > j\) if and only if there is a directed path from node \(i\) to node \(j\).A permutation \(\pi\) of \([n]\) is called a linear extension of \(G\) (or the associated poset \(V\)) if \(\pi^{-1}(i)> \pi^{-1}(j)\) for every edge \(i\rightarrow j \in E\).

Let \(P_{LE}(G)\) be the polytope in \(R^n\) defined by \(P_{LE}(G) =\{x\in R^n\ |\ 1\geq x_i \geq 0 \text{ for all }i=1,2,\dots ,n\), and \(x_i\geq x_j\) for all directed edges \(i\rightarrow j \in E\).

A well known result is \(\#_{LE}G=vol(P)*n!\).

The following example from https://inf.ethz.ch/personal/fukudak/lect/pclect/notes2016/expoly_order.pdf has \(9\) linear extensions. See also https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/viewFile/16957/15838 for counting linear extensions in practice.

graph

graph

We can approximate this number by the following code.

A = matrix(c(-1,0,1,0,0,0,-1,1,0,0,0,-1,0,1,0,0,0,0,-1,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,-1,0,0,0,0,0,-1,0,0,0,0,0,-1,0,0,0,0,0,-1,0,0,0,0,0,-1),
ncol=5, nrow=14, byrow=TRUE)
b = c(0,0,0,0,0,0,0,0,0,1,1,1,1,1)
P = Hpolytope$new(A, b)
volume(P,error=0.2)*factorial(5)
## [1] 9.07765462

Financial modelling

For a specific set of assets in a stock market, portfolios are characterized by their return and their risk which is the variance of the portfolios’ returns (volatility). Copulas are approximations of the bivariate joint distribution of return and volatility.

#install.packages('plotly')
library('plotly')
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
MatReturns <- read.table("https://stanford.edu/class/ee103/data/returns.txt", sep=",")
MatReturns = MatReturns[-c(1,2),]
dates = as.character(MatReturns$V1)
MatReturns = as.matrix(MatReturns[,-c(1,54)])
MatReturns = matrix(as.numeric(MatReturns [,]),nrow = dim(MatReturns )[1], ncol = dim(MatReturns )[2], byrow = FALSE)

#starting_date = "2008-12-18" ##crisis period
#stopping_date = "2009-03-13"

starting_date = "2007-03-07" ##normal period
stopping_date = "2007-05-31"

row1 = which(dates %in% starting_date)
row2 = which(dates %in% stopping_date)

nassets = dim(MatReturns)[2]
W=row1:row2

compRet = rep(1,nassets)
for (j in 1:nassets) {
  for (k in W) {
    compRet[j] = compRet[j] * (1 + MatReturns[k, j])
  }
  compRet[j] = compRet[j] - 1
}

mass = copula2(h = compRet, E = cov(MatReturns[row1:row2,]), numSlices = 100, N=1000000)

plot_ly(z = ~mass) %>% add_surface(showscale=FALSE)