-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodeling-zero-day-malware-spread.html
More file actions
528 lines (470 loc) · 43.8 KB
/
modeling-zero-day-malware-spread.html
File metadata and controls
528 lines (470 loc) · 43.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="author" content="Roman Sinayev" />
<meta name="copyright" content="Roman Sinayev" />
<meta property="og:type" content="article" />
<meta name="twitter:card" content="summary">
<meta name="keywords" content="modeling, malware" />
<meta property="og:title" content="Modeling Zero Day Malware Spread "/>
<meta property="og:url" content="http://lqdc.github.io/modeling-zero-day-malware-spread.html" />
<meta property="og:description" content="All models are wrong, but some are useful. -- George E. P. Box Does installing an antivirus that catches 2% more malware really make a difference? Should antiviruses try to concentrate on catching yet unknown malware requiring a small delay or is it acceptable if they primarily rely on simpler and ..." />
<meta property="og:site_name" content="lqdc blog" />
<meta property="og:article:author" content="Roman Sinayev" />
<meta property="og:article:published_time" content="2015-03-09T03:03:00-07:00" />
<meta name="twitter:title" content="Modeling Zero Day Malware Spread ">
<meta name="twitter:description" content="All models are wrong, but some are useful. -- George E. P. Box Does installing an antivirus that catches 2% more malware really make a difference? Should antiviruses try to concentrate on catching yet unknown malware requiring a small delay or is it acceptable if they primarily rely on simpler and ...">
<title>Modeling Zero Day Malware Spread · lqdc blog
</title>
<link href="//netdna.bootstrapcdn.com/twitter-bootstrap/2.3.2/css/bootstrap-combined.min.css" rel="stylesheet">
<link href="//netdna.bootstrapcdn.com/font-awesome/4.0.1/css/font-awesome.css" rel="stylesheet">
<link rel="stylesheet" type="text/css" href="http://lqdc.github.io/theme/css/pygments.css" media="screen">
<link rel="stylesheet" type="text/css" href="http://lqdc.github.io/theme/tipuesearch/tipuesearch.css" media="screen">
<link rel="stylesheet" type="text/css" href="http://lqdc.github.io/theme/css/elegant.css" media="screen">
<link rel="stylesheet" type="text/css" href="http://lqdc.github.io/theme/css/custom.css" media="screen">
<link href="http://lqdc.github.io/feed.xml" type="application/atom+xml" rel="alternate" title="lqdc blog - Full Atom Feed" />
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-60488389-1', 'auto');
ga('send', 'pageview');
</script>
<link rel="stylesheet" href="http://lqdc.github.io/css/nv.d3.css" type="text/css" />
<link rel="stylesheet" href="http://lqdc.github.io/css/lotka.css" type="text/css" />
</head>
<body>
<div id="content-sans-footer">
<div class="navbar navbar-static-top">
<div class="navbar-inner">
<div class="container-fluid">
<a class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</a>
<a class="brand" href="http://lqdc.github.io/"><span class=site-name>lqdc blog</span></a>
<div class="nav-collapse collapse">
<ul class="nav pull-right top-menu">
<li ><a href="http://lqdc.github.io">Home</a></li>
<li ><a href="http://lqdc.github.io/categories.html">Categories</a></li>
<li ><a href="http://lqdc.github.io/tags.html">Tags</a></li>
<li ><a href="http://lqdc.github.io/archives.html">Archives</a></li>
<li><form class="navbar-search" action="http://lqdc.github.io/search.html" onsubmit="return validateForm(this.elements['q'].value);"> <input type="text" class="search-query" placeholder="Search" name="q" id="tipue_search_input"></form></li>
</ul>
</div>
</div>
</div>
</div>
<div class="container-fluid">
<div class="row-fluid">
<div class="span1"></div>
<div class="span10">
<article>
<div class="row-fluid">
<header class="page-header span10 offset2">
<h1><a href="http://lqdc.github.io/modeling-zero-day-malware-spread.html"> Modeling Zero Day Malware Spread </a></h1>
</header>
</div>
<div class="row-fluid">
<div class="span8 offset2 article-content">
<blockquote>
<p>All models are wrong, but some are useful.</p>
<p>-- <cite>George E. P. Box</cite></p>
</blockquote>
<p>Does installing an antivirus that catches 2% more malware really make a difference? Should antiviruses try to concentrate on catching yet unknown malware requiring a small delay or is it acceptable if they primarily rely on simpler and faster signature-based detection? These and many other questions can be answered by modeling the behavior of malware and later mapping it to existing malware infection data.</p>
<p>In this post we will try to apply a modeling technique commonly used for epidemic and population modeling to predict the spread of malware. However, first we should go over a simple application of differential equations to model monthly fluctuations in populations of foxes and rabbits, because although the application is very different, the equations are not.</p>
<h2>Foxes and Rabbits</h2>
<p><img alt="Fox and Rabbit by Phil Haynes" src="http://lqdc.github.io/images/rabbits-foxes/fox_rabbit.jpg" /></p>
<p>First, let’s establish what we are trying to model. We are interested in figuring out how the populations of foxes and rabbits fluctuate over time. We know that rabbits would multiply exponentially if there were no predators. We know that for foxes to grow and flourish there need to be some rabbits around as they are the food source. We also know that both foxes and rabbits can die of old age. There are many other things that are true about foxes and rabbits, but let’s say that these are the main contributors to the population dynamics.</p>
<p>Luckily for us, there already exists a pair of equations that can be used to model just this scenario. They are called Lotka-Volterra equations and different variations of these equations are used in ecology and epidemiology to model animal population dynamics and disease proliferation.</p>
<p>Without further ado, the equations are below:</p>
<div class="math">$$
\begin{align}
\frac{dx}{dt}&=x (a-by-c)\\
\frac{dy}{dt}&=y (ebx-\delta)
\end{align}
$$</div>
<p>In the first part, we see that <span class="math">\(dx/dt\)</span>, or change in <span class="math">\(x\)</span> (the rabbit population) over time, is assumed to grow (<span class="math">\(xa\)</span>) unless subject to predation (<span class="math">\(-bxy\)</span>) and rabbits dying of old age (<span class="math">\(-cx\)</span>).</p>
<p>In the second part, we see that <span class="math">\(dy/dt\)</span>, or change of <span class="math">\(y\)</span> (the fox population) over time, is dictated by the rate of rabbit consumption (<span class="math">\(ebxy\)</span>) and natural death of foxes over time (<span class="math">\(-\delta y\)</span>). A more detailed explanation is found on the <a href="http://en.wikipedia.org/wiki/Lotka%E2%80%93Volterra_equation#Physical_meanings_of_the_equations">equation wiki page</a>.</p>
<p>Integrating these analytically can be daunting, especially if there are more equations and variables. We'll use a simple numerical integration technique called the <a href="http://en.wikipedia.org/wiki/Midpoint_method">Midpoint Method</a> to integrate the equations numerically and actually observe these trends.</p>
<p>A future point depends only on the previous point. So the population of rabbits tomorrow can be estimated from knowing today's populations of both foxes and rabbits.</p>
<p>In Python, to calculate <span class="math">\(x_{t+1}\)</span> and <span class="math">\(y_{t+1}\)</span> for each day <span class="math">\(t\)</span> in <span class="math">\(\{0..730\}\)</span>, we can do the following:</p>
<div class="highlight"><pre><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">pylab</span> <span class="k">as</span> <span class="nn">pl</span>
<span class="k">def</span> <span class="nf">calc_lv</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">coefs</span><span class="p">):</span>
<span class="n">x_new</span> <span class="o">=</span> <span class="n">x</span> <span class="o">*</span> <span class="p">(</span><span class="n">coefs</span><span class="p">[</span><span class="s">'a'</span><span class="p">]</span> <span class="o">-</span> <span class="n">coefs</span><span class="p">[</span><span class="s">'b'</span><span class="p">]</span> <span class="o">*</span> <span class="n">y</span> <span class="o">-</span> <span class="n">coefs</span><span class="p">[</span><span class="s">'c'</span><span class="p">])</span>
<span class="n">y_new</span> <span class="o">=</span> <span class="n">y</span> <span class="o">*</span> <span class="p">(</span><span class="n">coefs</span><span class="p">[</span><span class="s">'e'</span><span class="p">]</span> <span class="o">*</span> <span class="n">coefs</span><span class="p">[</span><span class="s">'b'</span><span class="p">]</span> <span class="o">*</span> <span class="n">x</span> <span class="o">-</span> <span class="n">coefs</span><span class="p">[</span><span class="s">'d'</span><span class="p">])</span>
<span class="k">return</span> <span class="n">x_new</span><span class="p">,</span> <span class="n">y_new</span>
<span class="k">def</span> <span class="nf">midpoint_method_lotka</span><span class="p">(</span><span class="n">t</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">coefs</span><span class="p">,</span> <span class="n">dt</span><span class="p">,</span> <span class="n">N</span><span class="p">):</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">N</span><span class="p">):</span>
<span class="n">dx1</span><span class="p">,</span> <span class="n">dy1</span> <span class="o">=</span> <span class="n">calc_lv</span><span class="p">(</span><span class="n">x</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="n">y</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="n">coefs</span><span class="p">)</span>
<span class="n">dx2</span><span class="p">,</span> <span class="n">dy2</span> <span class="o">=</span> <span class="n">calc_lv</span><span class="p">(</span><span class="n">x</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">+</span> <span class="n">dt</span> <span class="o">*</span> <span class="n">dx1</span> <span class="o">/</span> <span class="mi">2</span><span class="p">,</span> <span class="n">y</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">+</span> <span class="n">dt</span> <span class="o">*</span> <span class="n">dy1</span> <span class="o">/</span> <span class="mi">2</span><span class="p">,</span> <span class="n">coefs</span><span class="p">)</span>
<span class="n">x</span><span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]</span> <span class="o">=</span> <span class="n">x</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">+</span> <span class="n">dt</span> <span class="o">*</span> <span class="n">dx2</span>
<span class="n">y</span><span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]</span> <span class="o">=</span> <span class="n">y</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">+</span> <span class="n">dt</span> <span class="o">*</span> <span class="n">dy2</span>
<span class="n">t</span><span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span> <span class="o">*</span> <span class="n">dt</span>
<span class="k">return</span> <span class="n">t</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span>
<span class="k">def</span> <span class="nf">do_lotka</span><span class="p">():</span>
<span class="n">a</span> <span class="o">=</span> <span class="mf">0.04</span>
<span class="n">b</span> <span class="o">=</span> <span class="mf">0.0005</span>
<span class="n">c</span> <span class="o">=</span> <span class="mf">0.0001</span>
<span class="n">d</span> <span class="o">=</span> <span class="mf">0.2</span>
<span class="n">e</span> <span class="o">=</span> <span class="mf">0.1</span>
<span class="n">dt</span> <span class="o">=</span> <span class="mf">0.01</span>
<span class="n">days</span> <span class="o">=</span> <span class="mi">730</span>
<span class="n">N</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">days</span> <span class="o">/</span> <span class="n">dt</span><span class="p">)</span>
<span class="n">coefs</span> <span class="o">=</span> <span class="p">{</span><span class="s">'a'</span><span class="p">:</span> <span class="n">a</span><span class="p">,</span> <span class="s">'b'</span><span class="p">:</span> <span class="n">b</span><span class="p">,</span> <span class="s">'c'</span><span class="p">:</span> <span class="n">c</span><span class="p">,</span> <span class="s">'d'</span><span class="p">:</span> <span class="n">d</span><span class="p">,</span> <span class="s">'e'</span><span class="p">:</span> <span class="n">e</span><span class="p">}</span>
<span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">t</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="mi">3</span><span class="p">,</span> <span class="n">N</span> <span class="o">+</span> <span class="mi">1</span><span class="p">))</span>
<span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">=</span> <span class="mi">200</span> <span class="c"># Initial prey data</span>
<span class="n">y</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">=</span> <span class="mi">50</span> <span class="c"># Initial predator data</span>
<span class="n">t</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span> <span class="o">=</span> <span class="n">midpoint_method_lotka</span><span class="p">(</span><span class="n">t</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">coefs</span><span class="p">,</span> <span class="n">dt</span><span class="p">,</span> <span class="n">N</span><span class="p">)</span>
<span class="n">pl</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">t</span><span class="p">,</span> <span class="n">x</span><span class="o">/</span><span class="mi">10</span><span class="p">,</span> <span class="s">'g'</span><span class="p">,</span> <span class="n">t</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="s">'r'</span><span class="p">)</span>
<span class="n">pl</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
<p>Implementing it in JavaScript results in the following graph where you can play with the coefficients:</p>
<!--<style>table{table-layout:auto;}</style>-->
<div id="lv_midpoint" class="d3-special">
<svg></svg>
</div>
<div style="text-align: right;">
<div style="float: left;">Rabbit Growth (a) = <span id="aValue-value"></span><input type="number" min="0" max="1.0" step="0.005" value="0.04" id="aValueLotka"></div>
<div style="float: right;"> Fox Predation (b) = <span id="bValue-value"></span><input type="number" min="0" max="1.0" step="0.0001" value="0.0005" id="bValueLotka"></div>
<div style="clear: both;"></div>
<div style="float: left;">Rabbit Natural Death (c) = <span id="cValue-value"></span><input type="number" min="0" max="1.0" step="0.0001" value="0.0001" id="cValueLotka"></div>
<div style="float:right;">Fox Natural Death (delta) = <span id="dValue-value"></span><input type="number" min="0" max="1.0" step="0.01" value="0.05" id="dValueLotka"></div>
<div style="clear: both;"></div>
<div style="float: left;">Fox Growth Multiplier (e) = <span id="eValue-value"></span><input type="number" min="0" max="1.0" step="0.01" value="0.1" id="eValueLotka"></div>
<div style="float: right;">Starting Rabbit Num (x_0) = <span id="x0Value-value"></span><input type="number" min="1" max="100000" step="1" value="200" id="x0ValueLotka"></div>
<div style="clear: both;"></div>
<div style="float: left;">Starting Fox Num (y_0) = <span id="y0Value-value"></span><input type="number" min="1" max="100000" step="1" value="50" id="y0ValueLotka"></div>
</div>
<div style="clear: both; margin-bottom: 20px;"></div>
<p>From the plots you can see that the populations are oscillating. The foxes die off when there are no rabbits. Since there are fewer foxes, rabbits’ population increases. As soon as the rabbit population starts increasing, the fox population also starts increasing. A larger number of foxes eats rabbits faster than they multiply, so rabbit population drops and the cycle continues.</p>
<p>This demonstrates that if we know (or make assumptions about) some rules of interaction between distinct populations, then we can write down some differential equations and analyze the dynamics of these populations.</p>
<h2>Onto Malware</h2>
<p>To model the spread of malware, we will use the same paradigm but design it a little differently to reflect the nature of malware proliferation. Specifically, we will simply use our knowledge of how malware can spread to design equations that allow us to model and analyze the dynamics of its proliferation.</p>
<p>We are going to model a network of computers that periodically talk to each other. The network can be a TCP/IP network or just some computers that share information via USB drives.</p>
<p>Vulnerable computers can be infected with 2 types of malware – known malware and so-called 0-Day malware. The distinction here is that AV companies have signatures for known malware.</p>
<p>Once a computer is infected and that infection is detected, it can be quarantined. Here quarantined means that a sysadmin has isolated it from the rest of the network. After the computer is quarantined it is disinfected and then put back on the network.</p>
<p>Antivirus companies periodically update their signatures, so computers infected with unidentified malware eventually get converted to an infection with identified known malware, i.e. malware they were infected with can become “known”.
Malware writers are more likely to write malware if there are a large number of known-vulnerable computers and some number of active malware already in the wild. This way the effect of each malware would be amplified and they could improve upon extant malware.</p>
<p>Malware periodically stops working and bad domain/IP lists are periodically updated to block C&C server traffic.
0-Day malware is harder to write, but it is probably written better and has a higher chance of succeeding.</p>
<p>We can express these ideas with differential equations. One way to do it is the following:</p>
<div class="math">$$
\begin{align}
\frac{dV}{dt}&=-\epsilon V^k(1-e^{-a_0 Z}) + bQ -c V^k(1-e^{-a W})\\
\frac{dI}{dt}&=c V^k(1-e^{-aW}) - fI + h U\\
\frac{dQ}{dt}&= fI - bQ + gU\\
\frac{dU}{dt}&= \epsilon V^k (1-e^{-a_0 Z}) - U (g+h)\\
\frac{dW}{dt}&= l (1-e^{-p W}) V^k - m W\\
\frac{dZ}{dt}&=n (1-e^{-p_0 Z})V^k - o Z\\
\end{align}
$$</div>
<ol>
<li><span class="math">\(V\)</span> or the <strong>Vulnerable</strong> population is the normal population of computers within a network that have a potential to get malware.</li>
<li><span class="math">\(Q\)</span> or the <strong>Quarantined</strong> population are the computers that have been quarantined after a sysadmin found some malware on the machine and decided to take it off the network so that it doesn’t infect other computers.</li>
<li><span class="math">\(I\)</span> or the Computers <strong>Infected</strong> with malware.</li>
<li><span class="math">\(U\)</span> or the Computers infected with <strong>Unidentified</strong> malware/0-Days.</li>
<li><span class="math">\(W\)</span> or the regular <strong>Worm</strong>/Malware C&C server population is the number of known malware in the wild.</li>
<li><span class="math">\(Z\)</span> or the <strong>0-Day Worm</strong>/Malware C&C population.</li>
</ol>
<p>The coefficients mean the following:</p>
<p><span class="math">\(c\)</span> and <span class="math">\(\epsilon\)</span> are the probabilities that a given known malware and 0-Day malware, respectively, would be functional and not immediately detected.</p>
<p><span class="math">\(a\)</span> and <span class="math">\(a_0\)</span> are the coefficients of expected infected computers where <span class="math">\(aW\)</span> and <span class="math">\(a_0 Z\)</span> are the expected number of new infected computers for regular malware and 0-Days, respectively.</p>
<p><span class="math">\(f\)</span> is the catch rate of known malware when it already exists on a machine.</p>
<p><span class="math">\(h\)</span> is the fraction of unknown malware that is discovered and labeled as malware by AV companies every day.</p>
<p><span class="math">\(b\)</span> is the fraction of quarantined computers that become disinfected and go back on the network. This is really the response time of a SysAdmin/IT forensics in dealing with known infections.</p>
<p><span class="math">\(g\)</span> is the catch rate of unknown malware. This is in the same role as <span class="math">\(f\)</span>, but here no signatures exist.</p>
<p><span class="math">\(k\)</span> is the clustering coefficient for computers. That is, within a set of computers that share information among themselves, it is possible that there are clusters of computers that are more connected (e.g.: sharing USB drives).</p>
<p><span class="math">\(l\)</span> and <span class="math">\(n\)</span> are the coefficients of malware and 0-Day malware growth.</p>
<p><span class="math">\(m\)</span> and <span class="math">\(o\)</span> are the coefficients for reduction in number of C&C servers. These could either represent malware no longer working because it is not compatible with system updates or C&C servers getting blacklisted.</p>
<p><span class="math">\(p\)</span> and <span class="math">\(p_0\)</span> are the coefficients of expected number of C&C servers where <span class="math">\(pW\)</span> and <span class="math">\(p_0Z\)</span> are the expected number of new C&C servers for regular malware and 0-Days, respectively.</p>
<blockquote>
<p><strong>Note:</strong>
<span class="math">\(e^{-aW}\)</span> represents the fraction of computers <strong>not</strong> infected according to a <a href="http://en.wikipedia.org/wiki/Poisson_distribution" title="http://en.wikipedia.org/wiki/Poisson_distribution">Poisson distribution</a> with a value of <span class="math">\(0\)</span>. That is, <span class="math">\(\frac{\lambda^k e^{-\lambda}}{k!}\)</span> where <span class="math">\(k=0\)</span> and <span class="math">\(\lambda=a\)</span>. Since there are <span class="math">\(W\)</span> malware in the wild, the equation becomes <span class="math">\((e^{-a})^W\)</span> or <span class="math">\(e^{-aW}\)</span>. Therefore <span class="math">\(1-e^{-aW}\)</span> represents the fraction that <strong>is</strong> infected. <span class="math">\(aW\)</span> then represents the Poisson parameter of expected number of computers infected per day. We use a Poisson distribution here because it can be used to predict frequencies of occurrences of rare events.</p>
</blockquote>
<p>This version of the model assumes the following:</p>
<ol>
<li>Infected computers cannot infect others directly.</li>
<li>All vulnerable computers are equally likely to be infected.</li>
<li>There is only one infection per computer.</li>
<li>Probabilities of infection are given by a Poisson distribution.</li>
<li>One malware entity per C&C server.</li>
<li>Total number of computers remains the same.</li>
</ol>
<p>To integrate these numerically, we will use the midpoint method again. You can find the Python implementation for that <a href="https://gist.githubusercontent.com/lqdc/b171c280fbf543c7831c/raw/48f119f7e8c54efe0cb4487268c31dbd97198659/model.py">here</a>. It is also implemented/plotted below in JS, so you can play with the coefficients of the malware diagram here:</p>
<div id="ml_computers" class="d3-special">
<svg></svg>
</div>
<div>
<div class="left-float">a = <span id="aValMal-value"></span><input type="number" min="0" max="1.0" step="0.0001" value="0.0001" id="aValMal">
</div>
<div class="left-float">a_0 = <span id="a0ValMal-value"></span><input type="number" min="0" max="1.0" step="0.001" value="0.001" id="a0ValMal">
</div>
<div class="left-float">b = <span id="bValMal-value"></span><input type="number" min="0" max="1.0" step="0.01" value="1.0" id="bValMal">
</div>
<div class="left-float">c = <span id="cValMal-value"></span><input type="number" min="0" max="1.0" step="0.01" value="0.2" id="cValMal">
</div>
<div class="left-float">f = <span id="fValMal-value"></span><input type="number" min="0" max="1.0" step="0.01" value="0.9" id="fValMal"></div>
<div class="left-float">epsilon = <span id="eValMal-value"></span><input type="number" min="0" max="1.0" step="0.1" value="0.8" id="eValMal">
</div>
<div class="left-float">g = <span id="gValMal-value"></span><input type="number" min="0" max="1.0" step="0.1" value="0.2" id="gValMal">
</div>
<div class="left-float">
h = <span id="hValMal-value"></span><input type="number" min="0" max="1.0" step="0.1" value="1.0" id="hValMal">
</div>
<div class="left-float">
k = <span id="kValMal-value"></span><input type="number" min="0" max="3.0" step="0.1" value="1.0" id="kValMal">
</div>
<div class="left-float">
l = <span id="lValMal-value"></span><input type="number" min="0" max="1.0" step="0.1" value="1.0" id="lValMal">
</div>
<div class="left-float">
m = <span id="mValMal-value"></span><input type="number" min="0" max="1.0" step="0.1" value="0.3" id="mValMal">
</div>
<div class="left-float">
n = <span id="nValMal-value"></span><input type="number" min="0" max="1.0" step="0.1" value="0.1" id="nValMal">
</div>
<div class="left-float">
o = <span id="oValMal-value"></span><input type="number" min="0" max="1.0" step="0.1" value="0.3" id="oValMal">
</div>
<div class="left-float">
p = <span id="pValMal-value"></span><input type="number" min="0" max="1.0" step="0.01" value="0.05" id="pValMal">
</div>
<div class="left-float">
p_0 = <span id="p0ValMal-value"></span><input type="number" min="0" max="1.0" step="0.01" value="0.08" id="p0ValMal">
</div>
<div class="left-float">
W_0 = <span id="W0ValMal-value"></span><input type="number" min="0" max="1000000" step="1" value="100" id="W0ValMal">
</div>
<div class="left-float">
Z_0 = <span id="Z0ValMal-value"></span><input type="number" min="0" max="1000000" step="1" value="10" id="Z0ValMal">
</div>
<div class="left-float">
V_0 = <span id="V0ValMal-value"></span><input type="number" min="1" max="100000" step="1" value="989" id="V0ValMal">
</div>
<div class="left-float">
Q_0 = <span id="Q0ValMal-value"></span><input type="number" min="0" max="100000" step="1" value="0" id="Q0ValMal">
</div>
<div class="left-float">
I_0 = <span id="I0ValMal-value"></span><input type="number" min="0" max="100000" step="1" value="10" id="I0ValMal">
</div>
<div class="left-float">
U_0 = <span id="U0ValMal-value"></span><input type="number" min="0" max="100000" step="1" value="1" id="U0ValMal">
</div>
</div>
<div style="clear: both;"></div>
<div id="ml_cnc" class="d3-special">
<svg></svg>
</div>
<h3>Meaning and Implications</h3>
<p>To figure out what the model implies and represents we will set some coefficients to zero or 1.</p>
<p>Setting <span class="math">\(c\)</span> and <span class="math">\(l\)</span> to zero removes the effects of known malware, while setting <span class="math">\(\epsilon\)</span> and <span class="math">\(n\)</span> to zero removes the effects of 0-Days.</p>
<p><img alt="No 0-Days." src="http://lqdc.github.io/images/rabbits-foxes/e0n0.png" title="No 0-Days." /></p>
<p>If we set <span class="math">\(b\)</span> to zero, this could represent a very lazy IT department that never fixes infected quarantined computers.</p>
<p><img alt="Lazy IT/forensics department." src="http://lqdc.github.io/images/rabbits-foxes/b0.png" title="Lazy IT/forensics department" /> On the other hand, setting <span class="math">\(b\)</span> to <span class="math">\(1\)</span> can represent a very active IT department that fixes all infected computers within a day.</p>
<p>If we set <span class="math">\(f\)</span> and <span class="math">\(g\)</span> to <span class="math">\(1\)</span>, we have a situation where all malware is detected after just one day of being in the wild.</p>
<p><img alt="Malware detected within one day." src="http://lqdc.github.io/images/rabbits-foxes/f1g1.png" title="Malware detected within one day." /></p>
<p>This could be a future sandbox that takes some time to reach its conclusion, but always finds the malware. Finally, if we set <span class="math">\(f\)</span> and/or <span class="math">\(g\)</span> to zero, this would represent an incompetent antivirus/sandbox that never detects anything if it gets past the initial defenses.</p>
<p><img alt="Incompetent antivirus/sandbox." src="http://lqdc.github.io/images/rabbits-foxes/f0g0.png" title="Incompetent antivirus/sandbox." /></p>
<p>Since <span class="math">\(o\)</span> and <span class="math">\(m\)</span> are rates of malware C&C servers being blocked, setting them to zero results in unabated malware growth.</p>
<p><img alt="No detection of C&C Servers hosting malware." src="http://lqdc.github.io/images/rabbits-foxes/o0m0.png" title="No detection of C&C Servers hosting malware." />
At the opposite end of the spectrum, setting them to <span class="math">\(1\)</span> might represent very active Domain/IP blocking where C&C servers disappear within a day after being active.</p>
<p><img alt="C&C servers detected within a day of being active." src="http://lqdc.github.io/images/rabbits-foxes/o1m1.png" title="C&C servers detected within a day of being active." /></p>
<h3>Without 0-Days</h3>
<p>If we set <span class="math">\(\epsilon\)</span> and <span class="math">\(n\)</span> to zero, we are ignoring the effect of 0-Day malware and associated C&C servers. Thus we are left with
</p>
<div class="math">$$
\begin{align}
\frac{dV}{dt}&= bQ -c V^k(1-e^{-a W})\\
\frac{dI}{dt}&= cV^k(1-e^{-aW}) - fI\\
\frac{dQ}{dt}&= fI - bQ\\
\frac{dW}{dt}&= l (1-e^{-p W}) V^k - m W\\
\end{align}
$$</div>
<p>
Here, <span class="math">\(c\)</span> can be viewed as a coefficient that represents the fraction of malware that is <strong>not</strong> detected every day. Therefore, <span class="math">\(c=0\)</span> corresponds to all known malware being blocked immediately</p>
<p><img alt="No 0-Days and all known malware blocked immediately." src="http://lqdc.github.io/images/rabbits-foxes/e0n0c0.png" title="No 0-Days and all known malware blocked immediately." />
and <span class="math">\(1\)</span> corresponds to malware not getting detected initially.</p>
<p><img alt="No 0-Days and malware not initially detected." src="http://lqdc.github.io/images/rabbits-foxes/e0n0c1.png" title="No 0-Days and malware not initially detected." />
Setting <span class="math">\(c\)</span> to <span class="math">\(1\)</span>, and <span class="math">\(f\)</span> to <span class="math">\(1\)</span> and leaving the rest of the coefficients at default values, we can see that some computers are still infected.</p>
<p><img alt="No zero days and malware not immediately detected, but always detected within a day." src="http://lqdc.github.io/images/rabbits-foxes/e0n0c1f1.png" title="No zero days and malware not immediately detected, but always detected within a day." /></p>
<p>What this implies is that the malware gets on some computers and is wiped within a day. We are still left with a constant ≈<span class="math">\(10\%\)</span> portion of computers being infected. However, if <span class="math">\(90\%\)</span> of the malware that gets on is initially detected (<span class="math">\(c=0.1\)</span>), only <span class="math">\(3\%\)</span> of the computers are infected. Therefore, we can gather that initial detection plays a major role in reducing the number of infections.</p>
<p>Keeping <span class="math">\(f=1\)</span> i.e. the detection after <span class="math">\(1\)</span> day at <span class="math">\(100\%\)</span>, we can also answer whether it matters if an antivirus detects <span class="math">\(90\%\)</span> or <span class="math">\(99\%\)</span> of malware immediately. At <span class="math">\(90\%\)</span> detection, <span class="math">\(2\%\)</span> of computers are perpetually infected</p>
<p><img alt="90% immediate detection and full detection after 1 day." src="http://lqdc.github.io/images/rabbits-foxes/e0n0c01f1.png" title="90% immediate detection and full detection after 1 day." />
while at <span class="math">\(99\%\)</span>, none are.</p>
<p><img alt="99% immediate detection and full detection after 1 day" src="http://lqdc.github.io/images/rabbits-foxes/e0n0c001f1.png" title="99% immediate detection and full detection after 1 day" />
So in a world where all malware is known and an antivirus can tell after a day whether a computer is infected, it is indeed better to use an antivirus that detects <span class="math">\(99\%\)</span> rather than <span class="math">\(90\%\)</span> of the malware immediately.</p>
<h3>Adding 0-Days</h3>
<p>After setting <span class="math">\(\epsilon\)</span> and <span class="math">\(n\)</span> back to their original values, we can explore what happens to the system when we have 0-Day malware and C&C servers and how 0-Day detection affects the number of infected computers.</p>
<p>First, we can look at the effects of <span class="math">\(o\)</span>, the coefficient of detection of C&C servers. Increasing it surprisingly results in an <em>increase</em> in the number of active known C&C servers, but a decrease in the number of infected and 0-Day infected computers. This makes sense, because there are more vulnerable computers, so malware authors may be more likely to spread existing malware.</p>
<p>A similar effect can be observed if we increase the detection rate of 0-Day malware (<span class="math">\(g\)</span>) or lower the fraction of working 0-Days (<span class="math">\(\epsilon\)</span>).</p>
<p><img alt="Full detection of 0-Days after one day, and 80% detection immediately" src="http://lqdc.github.io/images/rabbits-foxes/g1e02.png" title="Full detection of 0-Days after one day, and 80% detection immediately" />
The number of C&Cs that control 0-Day and known malware <em>increases</em>, but the number of computers infected with either decreases.</p>
<h3>Lessons learned</h3>
<p>Although when creating the model we made a number of assumptions that may not always be true, these results can still be useful, especially if the model is fit to real data.</p>
<p>Small improvements in detection rate of known and 0-Day malware can make a big difference in the number of users subject to infection. For example, a <span class="math">\(9\%\)</span> increase in immediate detection can result in almost complete elimination of the infected computer population, substantially decreasing the need for forensic analysis and potentially reducing the required IT budget. Overall, quicker signature generation (increasing the <span class="math">\(f\)</span> coefficient) and utilizing sandboxing techniques (increasing <span class="math">\(g\)</span> and decreasing <span class="math">\(\epsilon\)</span>) to detect 0-Days can have a profound impact in reducing the number of infected computers, thereby protecting users' data and preventing proliferation of botnets.</p>
<p>The full version of the Python model implementation <a href="https://gist.githubusercontent.com/lqdc/b171c280fbf543c7831c/raw/8d70ca6aee163f1b95736019ddf78147a79cb9f3/model.py">can be found on GitHub</a>.</p>
<script type="text/javascript">
// Injects the MathJax loader into the page exactly once. The guard checks for
// a script element carrying the loader's id so that repeated inclusion of this
// snippet does not load MathJax twice.
if (!document.getElementById('mathjaxscript_pelican_#%@#$@#')) {
    // Display settings interpolated into the MathJax configuration below.
    // The generated responsive override was compiled out (`if (false)`), so
    // these defaults always applied; the dead branch has been removed.
    var align = "center",
        indent = "0em",
        linebreak = "false";

    var mathjaxscript = document.createElement('script');
    mathjaxscript.id = 'mathjaxscript_pelican_#%@#$@#';
    mathjaxscript.type = 'text/javascript';
    // cdn.mathjax.org has been retired; load a pinned MathJax 2.x build from
    // cdnjs over HTTPS instead of a protocol-relative URL.
    mathjaxscript.src = 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
    // The inline configuration is attached as the script element's own text
    // (Opera exposes it as innerHTML, other browsers as text).
    mathjaxscript[(window.opera ? "innerHTML" : "text")] =
        "MathJax.Hub.Config({" +
        " config: ['MMLorHTML.js']," +
        " TeX: { extensions: ['AMSmath.js','AMSsymbols.js','noErrors.js','noUndefined.js'], equationNumbers: { autoNumber: 'AMS' } }," +
        " jax: ['input/TeX','input/MathML','output/HTML-CSS']," +
        " extensions: ['tex2jax.js','mml2jax.js','MathMenu.js','MathZoom.js']," +
        " displayAlign: '"+ align +"'," +
        " displayIndent: '"+ indent +"'," +
        " showMathMenu: true," +
        " tex2jax: { " +
        " inlineMath: [ ['\\\\(','\\\\)'] ], " +
        " displayMath: [ ['$$','$$'] ]," +
        " processEscapes: true," +
        " preview: 'TeX'," +
        " }, " +
        " 'HTML-CSS': { " +
        " styles: { '.MathJax_Display, .MathJax .mo, .MathJax .mi, .MathJax .mn': {color: 'inherit ! important'} }," +
        " linebreaks: { automatic: "+ linebreak +", width: '90% container' }," +
        " }, " +
        "}); ";
    // Prefer <body>; fall back to <head> if the body is not available yet.
    (document.body || document.getElementsByTagName('head')[0]).appendChild(mathjaxscript);
}
</script>
<section>
<div class="accordion" id="accordion2">
<div class="accordion-group">
<div class="accordion-heading">
<a class="accordion-toggle disqus-comment-count" data-toggle="collapse" data-parent="#accordion2"
href="http://lqdc.github.io/modeling-zero-day-malware-spread.html#disqus_thread">
Comments
</a>
</div>
<div id="disqus_thread" class="accordion-body collapse">
<div class="accordion-inner">
<div class="comments">
<div id="disqus_thread"></div>
<script type="text/javascript">
// Configuration globals for the comment thread; presumably read by the Disqus
// embed script that is loaded below (confirm against Disqus documentation).
var disqus_shortname = 'codellama';
var disqus_identifier = 'http://lqdc.github.io/modeling-zero-day-malware-spread.html';
var disqus_url = 'http://lqdc.github.io/modeling-zero-day-malware-spread.html';
(function() {
    // Asynchronously inject the forum-specific embed script.
    var dsq = document.createElement('script');
    dsq.type = 'text/javascript';
    dsq.async = true;
    // Use an explicit https:// URL: a protocol-relative URL fails when the page
    // is opened from file:// and permits a plain-HTTP load on http:// pages.
    dsq.src = 'https://' + disqus_shortname + '.disqus.com/embed.js';
    (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
})();
</script>
<noscript>Please enable JavaScript to view the <a href="http://disqus.com/?ref_noscript">comments powered by Disqus.</a></noscript>
<a href="http://disqus.com" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a>
</div>
</div>
</div>
</div>
</div>
</section>
<hr/>
</div>
<section>
<div class="span2" style="float:right;font-size:0.9em;">
<h4>Published</h4>
<time pubdate="pubdate" datetime="2015-03-09T03:03:00-07:00">Mar 9, 2015</time>
<h4>Category</h4>
<a class="category-link" href="http://lqdc.github.io/categories.html#malware-ref">malware</a>
<h4>Tags</h4>
<ul class="list-of-tags tags-in-article">
<li><a href="http://lqdc.github.io/tags.html#malware-ref">malware
<span>5</span>
</a></li>
<li><a href="http://lqdc.github.io/tags.html#modeling-ref">modeling
<span>1</span>
</a></li>
</ul>
<h4>Contact</h4>
<a href="https://github.com/lqdc" title="My github Profile" class="sidebar-social-links" target="_blank">
<i class="fa fa-github sidebar-social-links"></i></a>
<a href="http://rsinayev.com" title="My home Profile" class="sidebar-social-links" target="_blank">
<i class="fa fa-home sidebar-social-links"></i></a>
</div>
</section>
</div>
</article>
</div>
<div class="span1"></div>
</div>
</div>
<div id="push"></div>
</div>
<footer>
<div id="footer">
<ul class="footer-content">
<li class="elegant-power">Powered by <a href="http://getpelican.com/" title="Pelican Home Page">Pelican</a>. Theme: <a href="http://oncrashreboot.com/pelican-elegant" title="Theme Elegant Home Page">Elegant</a> by <a href="http://oncrashreboot.com" title="Talha Mansoor Home Page">Talha Mansoor</a></li>
</ul>
</div>
</footer>
<!-- jQuery must load before Bootstrap's plugins and the inline scripts that use $.
     Both are fetched over HTTPS so they are not blocked as mixed content. -->
<script src="https://code.jquery.com/jquery.min.js"></script>
<script src="https://netdna.bootstrapcdn.com/twitter-bootstrap/2.3.2/js/bootstrap.min.js"></script>
<script>
/**
 * Reports whether a search query is non-empty.
 *
 * @param query value exposing a `length` property (e.g. a string)
 * @returns {boolean} true when `query.length` is greater than zero
 */
function validateForm(query) {
    var isNonEmpty = 0 < query.length;
    return isNonEmpty;
}
</script>
<script type="text/javascript">
// Forum shortname used to build the Disqus script URL below.
var disqus_shortname = 'codellama';
(function () {
    // Asynchronously inject the Disqus comment-count script.
    var s = document.createElement('script');
    s.async = true;
    s.type = 'text/javascript';
    // Use an explicit https:// URL: a protocol-relative URL fails when the page
    // is opened from file:// and permits a plain-HTTP load on http:// pages.
    s.src = 'https://' + disqus_shortname + '.disqus.com/count.js';
    (document.getElementsByTagName('HEAD')[0] || document.getElementsByTagName('BODY')[0]).appendChild(s);
}());
</script>
<script language="javascript" type="text/javascript">
/**
 * Expands the comments panel when the URL fragment has the form
 * "#comment-<digits>"; does nothing for any other fragment.
 */
function uncollapse() {
    var commentFragment = /^#comment-\d+$/;
    if (!commentFragment.test(window.location.hash)) {
        return;
    }
    $('#disqus_thread').collapse('show');
}
</script>
<script type="text/javascript" language="JavaScript">
// Expand the comments panel right away if the page was opened on a comment fragment.
uncollapse();
// When the fragment later changes to "#comment-<digits>", reload the page.
window.onhashchange = function () {
    var fragment = window.location.hash;
    var pointsAtComment = fragment.match(/^#comment-\d+$/) !== null;
    if (pointsAtComment) {
        window.location.reload(true);
    }
};
</script>
<script>
// Swaps the accordion toggle's label while the comments panel is open and
// restores it when the panel closes.
//
// Both handlers are registered exactly once. Previously the 'hidden' handler
// was bound inside the 'shown' handler, so every open added another 'hidden'
// handler and the restore fade ran once per earlier open.
(function () {
    var restoreLabel;            // label captured when the panel was last shown
    var hasCapturedLabel = false;

    // Fade the toggle out, replace its text, and fade it back in.
    function fadeToLabel(toggle, label) {
        $(toggle).fadeOut(500, function () {
            $(this).text(label).fadeIn(500);
        });
    }

    $('#disqus_thread').on('shown', function () {
        var link = document.getElementsByClassName('accordion-toggle');
        // Capture at show time (not at page load) so that any label change made
        // after load is what gets restored.
        restoreLabel = link[0].innerHTML;
        hasCapturedLabel = true;
        fadeToLabel(link[0], 'Click here to hide comments');
    });

    $('#disqus_thread').on('hidden', function () {
        // Nothing to restore until the panel has been shown at least once.
        if (!hasCapturedLabel) {
            return;
        }
        var link = document.getElementsByClassName('accordion-toggle');
        fadeToLabel(link[0], restoreLabel);
    });
})();
</script>
<!-- D3, NVD3 and the page's own scripts. Kept inside <body> (script elements
     after </body> are a parse error) and fetched over HTTPS so they are not
     blocked as mixed content when the page is served securely. -->
<script src="https://d3js.org/d3.v3.min.js"></script>
<script src="https://lqdc.github.io/js/nv.d3.min_.js"></script>
<script src="https://lqdc.github.io/js/lotka.js"></script>
<script src="https://lqdc.github.io/js/malware.js"></script>
</body>
<!-- Theme: Elegant built for Pelican
License : http://oncrashreboot.com/pelican-elegant -->
</html>