@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: C:\alldata\data\webbib\bib2bib-1.92.exe -oc talks -ob talks.bib -c '($type = "MISC")' cormode.bib}}
@misc{TalkCormodeW8F,
title = {Some sketchy results},
key = {2011sketchy},
note = {Talk at DIMACS Workshop on Algorithms in the Field (8F)},
month = may,
year = 2011,
slides = {../slides/sketchy.pdf},
abstract = {
`Sketch' data structures are simple, efficient, compact summaries of large data sets. Sketches enable approximate computations like similarity comparison and summarization of distributions. There have been a number of successes where use of sketches have been applied in the field, most notably when dealing with very large data sets. In this talk, I'll present some key sketches and their properties, and describe some successes and some non-successes in their application. I'll conclude by outlining emerging applications for sketches, and their future potential.
}
}
@misc{TalkCormodeMergeableTalk11,
title = {Mergeable Summaries},
note = {Talk at Harvard University, DIMACS, Johns Hopkins, University of Pennsylvania, AT&T Labs},
key = {2011mergeable},
month = apr,
year = 2011,
slides = {../slides/mergeable-long.pdf},
abstract = {
In dealing with massive distributed data, exact computations may be possible but can still be very expensive to perform. Instead, it is often desirable to look to more lightweight computations that give (guaranteed) approximations. A central approach is the idea of the mergeable summary: a compact data structure that summarizes a data set, which can be merged with others to summarize the union of the data without significant growth in size. This framework enables parallel computation.
Samples and sketches are two examples of well-known, effective mergeable summaries. I'll present recent results on new, compact mergeable summaries for various tasks, such as approximating frequent items, order statistics, and range counts.
}
}
@misc{TalkCormodeColumbia11,
title = {Data Anonymization},
key = {2011anon},
note = {Guest lecture in 'Dealing with Massive Data' at Columbia University},
slides = {../slides/privacy-columbia.pdf},
month = mar,
year = 2011
}
@misc{TalkMergeable10,
title = {Distributed Summaries},
note = {Talk at DIMACS workshop on Parallelism: a 2020 vision},
year = 2011,
key = {2011mergeableshort},
slides = {../slides/mergeable.pdf},
abstract = {
In dealing with massive distributed data, exact computations may be possible but can still be very expensive to perform. Instead, it is often desirable to look to more lightweight computations that give (guaranteed) approximations. A central approach is the idea of the mergeable summary: a compact data structure that summarizes a data set, which can be merged with others to summarize the union of the data without significant growth in size. This framework enables parallel computation.
Samples and sketches are two examples of well-known, effective mergeable summaries. I'll present recent results on new, compact mergeable summaries for various tasks, such as approximating frequent items, order statistics, and range counts. }
}
@misc{TalkDbirday09,
title = {Anonymization and Uncertainty in Social Network Data},
abstract = {Many privacy questions arise when collecting information about individuals: how to allow use of this data without compromising the privacy of the data subjects? How to ensure that the end users of this data can find useful answers to their queries? Various anonymization techniques have been proposed which aim to find meaningful tradeoffs between privacy and utility. This talk presents the background for privacy and anonymization. Then I'll describe methods for anonymizing social network data, represented as a large, sparse semantic graph, which is additionally challenging due to the complex patterns of interactions between individuals. I'll also discuss some unexpected connections to uncertain data management, which provides many further directions for future work.
This talk covers join work with Divesh Srivastava, Balachander Krishnamuthy, and Smriti Bhagat. },
year = {2009},
month = oct,
key = {dbirday09},
slides = {../slides/dbirday.pdf},
note = {Talk at DBIR Day 2009 at NYU Poly},
pubsite = {http://cis.poly.edu/dbir2009/}
}
@misc{TalkSIP10,
title = {SIPping from the firehose: Streaming Interactive Proofs for verifying computations},
year = {2010},
month = {February},
key = {sip10},
slides = {../slides/sip2010.pdf},
note = {Talk at Bristol Algorithms Days 2010; Fort Meade},
abstract = {
When handling large quantities of data, it is desirable to outsource the computational effort to a third party: this captures current efforts in cloud computing, but also scenarios within trusted computing systems and inter-organizational data sharing. When the third party is not fully trusted, it is desirable to give assurance that the computation has been perfomed correctly. This talk presents some recent results in designing new protocols for verifying computations which are streaming in nature: the verifier (data owner) needs only a single pass over the input, storing a sublinear amount of information, and follows a simple protocol with a prover (serice provider) that takes a small number of rounds. A dishonest prover fools the verifier with only polynomially small probabiliy, while an honest prover's answer is always accepted. Within this model, we show protocols for a variety of problems that select items from the input (e.g. median, heavy hitters), or compute certain aggregates or structures (e.g. frequency moments, number of triangles in a graph). These problems require linear space to compute (exactly) in the traditional streaming model, showing that outsourcing can exponentially reduce the effort needed by the verifier to obtain correct answers.
}
}
@misc{TalkMinimality10,
title = {Progress in Data Anonymization: from k-anonymity to the minimality attack},
year = {2010},
key = {minimality10},
month = {February},
slides = {../slides/minimality.pdf},
note = {Talk in Cheltenham},
abstract = {
The last decade has seen much effort directed to addressing data anonymization: the problem of publishing a modified version of a dataset so that inference about individuals in the data set is limited. As with cryptography, progress has been made by new methods being proposed and their weaknesses being identified, leading to improved algorithms.
The simplest anonymity definition is k-anonymity: every individual in the data set should be indistinguishable from k-1 others. However, if there is homogoneity among this set of individuals then undesired inference is possible. To counter this, definitions such as l-diversity and t-closeness were advanced. Recently it was shown that over eager attempts to enforce these definitions can itself allow unwanted inferences to be drawn. This ``minimality attack'' has stood for several years. But recently it has been shown that this attack is less powerful than first thought: appropriately designed algorithms are invulnerable, and even in the most pessimistic case, the power of the attack can be bounded by a multiplicative factor of e (=2.78...).
In this talk, I will introduce these approaches to anonymization from first principles, working through measure and counter-measure, ending with our recent analysis of the minimality attack.
}
}
@misc{CormodeSrivastava09,
title = {Anonymized Data: Generation, Models, Usage},
author = {Graham Cormode and Divesh Srivastava},
att_authors = {gc2602,ds8961},
att_private = {false},
year = {2009},
note = {Tutorial at SIGMOD 2009},
month = jul,
url = {../papers/anontut.pdf},
slides = {../slides/anontut.ppt},
abstract = {
Data anonymization techniques have been the subject of intense
investigation in recent years, for many kinds of structured data,
including tabular, graph and item set data.
They enable publication of detailed information, which permits ad hoc
queries and analyses, while guaranteeing the privacy of
sensitive information in the data against a variety of attacks.
In this tutorial, we aim to present a \emph{unified} framework of data
anonymization techniques, viewed through the lens of uncertainty.
Essentially, anonymized data describes a set of possible worlds,
one of which corresponds to the original data.
We show that anonymization approaches such as suppression,
generalization, perturbation and permutation generate different
working models of uncertain data, some of which have been well
studied, while others open new directions for research.
We demonstrate that the privacy guarantees offered by methods
such as $k$-anonymization and $l$-diversity can be naturally
understood in terms of similarities and differences in the sets
of possible worlds that correspond to the anonymized data.
We describe how the body of work in query evaluation over
uncertain databases can be used for answering ad hoc queries
over anonymized data in a principled manner.
A key benefit of the unified approach is the identification of
a rich set of new problems for both the Data Anonymization and
the Uncertain Data communities.
}
}
@misc{CormodeSrivastava10,
title = {Anonymized Data: Generation, Models, Usage},
author = {Graham Cormode and Divesh Srivastava},
att_authors = {gc2602,ds8961},
att_private = {false},
year = {2010},
note = {Tutorial at ICDE 2010},
month = mar,
url = {../papers/anontut10.pdf},
slides = {../slides/anontut10.ppt},
abstract = {
Data anonymization techniques have been the subject of intense
investigation in recent years, for many kinds of structured data,
including tabular, graph and item set data.
They enable publication of detailed information, which permits ad hoc
queries and analyses, while guaranteeing the privacy of
sensitive information in the data against a variety of attacks.
In this tutorial, we aim to present a \emph{unified} framework of data
anonymization techniques, viewed through the lens of uncertainty.
Essentially, anonymized data describes a set of possible worlds,
one of which corresponds to the original data.
We show that anonymization approaches such as suppression,
generalization, perturbation and permutation generate different
working models of uncertain data, some of which have been well
studied, while others open new directions for research.
We demonstrate that the privacy guarantees offered by methods
such as $k$-anonymization and $l$-diversity can be naturally
understood in terms of similarities and differences in the sets
of possible worlds that correspond to the anonymized data.
We describe how the body of work in query evaluation over
uncertain databases can be used for answering ad hoc queries
over anonymized data in a principled manner.
A key benefit of the unified approach is the identification of
a rich set of new problems for both the Data Anonymization and
the Uncertain Data communities.
}
}
@misc{TalkMike66-08,
title = {On 'Selection and sorting with limited storage'},
note = {Talk at Mike66 Workshop celebrating Mike Paterson},
url = {../slides/mike66-medians.ppt},
slides = {../slides/mike66-medians.pdf},
month = sep,
year = 2008,
key = {2008},
abstract = {
In 'Selection and Sorting with Limited Storage' [MP78], Munro and Paterson
presented results on the selection (median finding) problem. Their model
allowed a small number of one-way passes over input stored on a read-ony
tape are possible. This paper is now regarded as one of the first works in
the streaming model. I will recap the results in [MP78], and discuss two
more recent results on this theme: approximate selection on a stream with
both 'insertion' and 'deletion' records; and tight lower bounds for
multi-pass median finding when the input data is randomly ordered.}
}
@misc{TalkDagstuhl08,
title = {Algorithms for Distributed Functional Monitoring},
note = {Talk at Dagstuhl Seminar on Sublinear Algorithms},
slides = {../slides/fmonitoring.pdf},
month = aug,
year = 2008,
key = {2008},
abstract = {
We study what we call {\em functional monitoring} problems. We have $k$
players each receiving a stream of items, and communicating with a central
coordinator. Let the multiset of items received by player $i$ up until
time $t$ be $A_i(t)$. The coordinator's task is to monitor a given
function $f$ computed over the union of the inputs $\cup_{i} A_i(t)$, {\em
continuously} at all times $t$. The goal is to minimize the number of
bits communicated between the players and the coordinator. Of interest is
the approximate version where the coordinator outputs $1$ if $f \geq \tau$
and $0$ if $f\leq (1-\epsilon)\tau$. This defines the
$(k,f,\tau,\epsilon)$ distributed, functional monitoring problem.
Functional monitoring problems are fundamental in distributed systems, in
particular sensor networks, where we must minimize communication; they also
connect to problems in streaming algorithms, communication complexity,
communication theory, and signal processing. Yet few formal bounds are
known for functional monitoring.
We give upper and lower bounds for the $(k,f,\tau,\epsilon)$ problem for
some of the basic $f$'s. In particular, we study frequency moments ($F_0,
F_1, F_2$). For $F_0$ and $F_1$, we obtain continuously monitoring
algorithms with costs almost the same as their one-shot computation
algorithms. However, for $F_2$ the monitoring problem seems much harder.
We give a carefully constructed multi-round algorithm that uses ``sketch
summaries'' at multiple levels of detail and solves the
$(k,F_2,\tau,\epsilon)$ problem with communication ${O}~(k^2/\epsilon
+ k^{3/2}/\epsilon^3)$.}
}
@misc{TalkBristol08,
title = {Data Stream Algorithms},
note = {Tutorial at Bristol Summer School on Probabilistic
Techniques in Computer Science. Streaming video available},
url = {../papers/bristol.pdf},
video = {http://www.cs.bris.ac.uk/probtcs08/videos-cormode.jsp},
slides = {../slides/bristol-all.ppt},
month = jul,
year = 2008,
key = {2008},
abstract = {
Many scenarios, such as network analysis, utility monitoring, and
financial applications, generate massive streams of data. These
streams consist of millions or billions of simple updates every hour,
and must be processed to extract the information described in tiny
pieces. These notes provide an introduction to (and set of references
for) {\em data stream
algorithms}, and some of the techniques that have been developed over
recent years to help mine the data while avoiding drowning
in these massive flows of information. }
}
@misc{CormodeGarofalakis06VLDB,
author = {G. Cormode and M. Garofalakis},
att_authors = {gc2602},
att_private = {false},
title = {Streaming in a Connected World: Querying and Tracking
Distributed Data Streams},
note = {Tutorial at VLDB 2006, SIGMOD 2007, EDBT 2008. Partial video available.},
year = {2008},
month = {March},
url = {../papers/cdsummary.pdf},
video = {http://doi.acm.org/10.1145/1247480.1247649},
slides = {../slides/vldb06tut-slides.ppt},
abstract = {
Today, a majority of data is fundamentally distributed in nature. Data for
almost any task is collected over a broad area, and streams in at a much
greater rate than ever before. In particular, advances in sensor
technology and miniaturization have led to the concept of the sensor
network: a (typically wireless) collection of sensing devices collecting
detailed data about their surroundings. A fundamental question arises: how
to query and monitor this rich new source of data? Similar scenarios
emerge within the context of monitoring more traditional, wired networks,
and in other emerging models such as P2P networks and grid-based
computing. In all cases, if we can perform more computational work within
the network to reduce the communication needed, then we can reduce
bandwidth usage and improve performance.
We consider two broad classes of
approaches to such in-network query processing, by analogy to query types
in traditional DBMSs. In the one shot model, a query is issued by a user
at some site, and must be answered based on the current state of data in
the network. We identify several possible approaches to this problem. For
simple queries, partial computation of the result over a tree can reduce
the data transferred significantly. For ``holistic'' queries, such as
medians, count distinct and so on, clever composable summaries give a
compact way to accurately approximate query answers. Lastly, careful
modeling of correlations between measurements and other trends in the data
can further reduce the number of sensors probed. In the continuous model,
a query is placed by a user which requires the answer to be available
continuously. This yields yet further challenges, since even using tree
computation, summarization and modeling, we cannot afford to communicate
every time new data is received by one of the remote sites. Instead, the
result of work on this problem has been a new tradeoff of reduced accuracy
in the query answer for reduced communication cost. This has led to a
variety of techniques for different query types to apportion the available
``uncertainty'' in the query answer between different sites, and to model
the evolution of measured values to anticipate future values and so reduce
communication further.
Our objective in this tutorial is to discuss the
algorithmic foundations of this new world, illustrate some of the powerful
techniques that have been developed to address these challenges, and
outline interesting directions for future work in the area.
Note: these slides are provided in powerpoint format for ease of
use (due to embedded animations) and for use in other presentations.
Please feel free to use these slides for your own purposes; we would
appreciate acknowledgements and citations (to the VLDB 06 proceedings).}
}
@misc{Cormode06Cluster,
title = {Cluster and Data Stream Analysis},
note = {Tutorial at DIMACS Workshop on Data Mining and
Epidemiology},
days = {24},
month = {March},
year = {2006},
key = {2006Cluster},
slides = {../slides/clustertutorial.pdf},
abstract = {
Clustering is an important tool in machine learning and data mining. It
allows features and correlations in the data to be identified and
requires few parameters and little detailed information about the data.
The results can be used to generate hypotheses, aid in visualization, or
reduce the data to a few prototypical points. This 'unsupervised
learning' technique has many variants and many perspectives. I will give
an algorithmic view, describing some of the most popular clustering
algorithms and identifying their pros and cons, including hierarchical
clustering, k-means, expectation maximization (EM) and k-center
approximation algorithms.
When the input data is too large to conveniently hold in memory, or is
being constantly updated, it is necessary to view the data as a massive
stream. In recent years the ``data stream'' model has become a popular
way to handle massive data sources. I will outline some of the key
properties of data streams, and illustrate this with some of the recent
work in clustering on data streams. }
}
@misc{CormodeBertinoro06,
title = {Biased Quantiles},
note = {Bertinoro},
key = {2006BQ},
month = {June},
year = {2006},
slides = {../slides/bquant-long.pdf},
abstract = {
Skew is prevalent in data streams, and should be taken into account
by algorithms that analyze the data.
The problem of finding ``biased quantiles''--- that is, approximate
quantiles which must be more accurate for more extreme values ---
is a framework for summarizing such skewed data on data streams.
We present the first deterministic algorithms for answering biased
quantiles queries accurately with small---sublinear in the input size---
space and time bounds in one pass.
The space bound is near-optimal, and the amortized update cost is
close to constant,
making it practical for handling high speed network data streams.
We demonstrate that not only can we prove theoretical properties of the
algorithm, but that it also uses less space than existing methods in many
practical settings, and is fast to maintain.}
}
@misc{TalkEntropy06,
title = {Computing the Entropy of a stream},
key = {2006},
note = {AT\&T Theory Seminar, Bell Labs Theory Seminar, DyDAn Seminar},
month = {December},
year = {2006},
url = {../slides/entropyslides.pdf},
abstract = {
Recently there has been much interest in using empirical entropy and
related measures to analyze behaviour in networks, and identify
anomalies. At the heart of this is the basic problem of, given a
sequence of tokens, how to efficiently compute the empirical entropy
of the sequence efficiently, in small space, and with a single pass.
I'll describe some work with Amit Chakrabarti and Andrew McGregor on
a simple algorithm for computing the entropy of such a stream, and the
corresponding lower bounds. I'll also discuss extensions to the
`graph entropy', and why higher order entropy is harder to
approximate.}
}
@misc{TalkCSKanpur06,
title = {A Compact Survey of Compressed Sensing},
key = {2006},
note = {Workshop on Algorithms for Data Streams, IIT Kanpur,
India},
month = {December},
year = {2006},
url = {../slides/cs-kanpur.pdf},
link = {http://www.cse.iitk.ac.in/users/sganguly/revised-schedule.html},
abstract = {A short survey of some of the recent results under the
banner of `compressed sensing', from a more algorithmic perspective.}
}
@misc{TalkInvDbn05,
title = {Tracking Inverse Distributions of Massive Data Streams},
key = {2005},
note = {Network Sampling Workshop in Paris, Bell Labs Research
Seminar},
month = {July},
year = {2005},
url = {../slides/invdbn-slides.pdf},
url2 = {../slides/invdbn-paris.pdf},
abstract = {
In order to maintain networks, defend against attacks, monitor service
quality, and track usage patterns, the masive amounts of traffic must be
processed to extract the necessary information. With data of this scale
and speed, even basic analysis tasks become difficult. Most prior
research has focused on questions on the ``forward distribution'' $f(x)$,
such as ``what is the median connection duration?''. I will discuss a new
set of challenges that come from studying the ``inverse distribution''
$f^-1(i)$, such as ``what is the median number of connections made by each
user?'' Tracking the inverse distribution has a variety of applications
in network monitoring, and requires some new techniques based on drawing
an effective sample from the inverse distribution. I will talk about how
to do this for a centralized monitoring point, for a pair of monitoring
points, and finally for a general distributed monitoring architecture.}
}
@misc{TalkDagstuhl05,
title = {Towards an Algorithmic Theory of Compressed Sensing},
key = {2005},
note = {Schloss Dagstuhl},
month = {July},
year = {2005},
abstract = {
In Approximation Theory, the fundamental problem is to
reconstruct a signal $A \in R^n$ from linear measurements
$$ with respect to a dictionary $\Psi$ for
$R^n$.
Recently, there has been
tremendous excitement about the novel direction of
{\em Compressed Sensing}~\cite{Donoho:04}
where the reconstruction can be done with
very few---$O(k log n)$---linear measurements over a modified
dictionary $\Psi'$ if the information of the signal is concentrated in
$k$ coefficients over an orthonormal basis $\Psi$.
These results have reconstruction
error on any given signal that is optimal with respect to
a broad class of signals.
In a series of papers and meetings over the past year, a theory
of Compressed Sensing has been developed by mathematicians.
We develop an algorithmic perspective for the
Compressed Sensing problem, showing that Compressed Sensing
results resonate with prior work in Group Testing, Learning theory
and Streaming algorithms.
Our main contributions are new algorithms that present the
most general results for Compressed Sensing with $1+\epsilon$
approximation on {\em every} signal, faster
algorithms for the reconstruction, as well as succinct
transformations of $\Psi$ to $\Psi'$.}
}
@misc{TalkSkewed05,
title = {Summarizing and Mining Skewed Data Streams},
url = {../slides/skewed-slides.pdf},
key = {2005},
note = {NJIT},
month = {May},
year = {2005}
}
@misc{TalkStrings00,
title = {Short String Signatures},
key = {2000},
note = {DIMACS Workshop on Sublinear Algorithms in Princeton, NJ},
month = {September},
year = {2000},
url = {../slides/skewed-slides.pdf},
pubsite = {http://dimacs.rutgers.edu/Workshops/Sublinear/},
abstract = {
When communication between two parties is the resource, sublinearity is
vital to improve on the trivial solution of one party sending their data
wholesale to the other. In the particular case of approximating distances
between strings, sublinear ''signatures'' can be efficiently computed and
exchanged. These signatures have applications beyoned communication
scenarios, in particular in approximate nearest neighbor algorithms.}
}
@misc{TalkEmbed02,
title = {Embeddings of Metrics on Strings and Permuations},
key = {2002embed},
note = {Workshop on Discrete Metric Spaces and their Algorithmic Applications in Haifa, Israel, March 2002. A short version of this talk was also given at BCTCS},
year = {2002},
month = {March},
url = {../slides/dmsa.pdf},
pubsite = {http://cs.haifa.ac.il/~yuri/DMSA02.html}
}
@misc{TalkText02,
title = {Algorithmic Embeddings for Comparing Large Text Streams},
key = {2002text},
note = {CCR/DIMACS Workshop/Tutorial on Mining Massive Data Sets and Streams: Mathematical Methods and Algorithms for Homeland Defense},
month = {June},
year = {2002},
url = {../slides/mmdss.pdf},
pubsite = {http://dimacs.rutgers.edu/Workshops/Homeland/},
abstract = {
Texts are ubiquitous in daily life, varying in size from small (SMS and
email) to potentially immense (automatically generated reports, biological
sequences). When scanning for similarities between new data and
previously stored examples, we need a model that takes account of how
texts are changed: pieces are inserted or deleted, and sections are moved
around. With a large number of large texts, trying to compare all pairs
is not feasible, and for the largest texts we may not even be able to hold
the whole of any text in memory at once. We describe an approach to
approximately comparing large texts quickly and in sublinear space. It
relies on finding combinatorial structures present in any string of
characters, and generating a vector representation. This allows rapid
comparison of sequences based on a succint representation, and the
application of clustering, nearest neighbor searching, and other data
mining techniques.}
}
@misc{TalkHot03,
title = {Tracking Frequent Items Dynamically},
note = {Institute of Advanced Studies, DIMACS, Stonybrook and U. Penn},
year = {2003},
key = {2003hot},
url = {../slides/whatshot-long.pdf},
abstract = {
Most database management systems maintain statistics on the underlying relation. One of the important statistics is that of the ''hot items'' in the relation: those that appear many times (most frequently, or more than some threshold). For example, end-biased histograms keep the hot items as part of the histogram and are used in selectivity estimation. Hot items are used as simple outliers in data mining, and in anomaly detection in networking applications.
I will describe some novel solutions to this problem based on viewing this as a group testing problem. We give approaches using both adaptive and non-adaptive group testing. These algorithms maintain a small space data structure that monitors the transactions on the relation, and when required, quickly output all hot items, without rescanning the relation in the database. With user-specified probability, it is able to report all hot items. I will then go on to describe some work in progress on extensions to this model, when we wish to find items whose frequency has changed by a significant amount, either in absolute or relative terms; and also finding related items which together are ''hot'' but individually are not. }
}
@misc{TalkL003,
title = {Zeroing in on the L0 Metric},
month = {August},
year = {2003},
key = {2003L0},
note = { DIMACS Workshop on Discrete Metric Spaces and their Algorithmic Applications at Princeton},
url = {../slides/L0metricspaces.pdf},
pubsite = {http://dimacs.rutgers.edu/Workshops/MetricSpaces/},
abstract = {
The L0 metric is defined to be the number of positions where two vectors differ. This can be challenging to build small space approximations for when the vectors represent dynamic data sets that are being updated (streaming fashion). The general solution presented here gives a highly flexible approximation scheme, which can be applied to other problems such as approximating the Dominance Norm, a measure of worst case influence, of very large numbers of signals.}
}
@misc{TalkHotNew03,
title = {What's Hot, What's Not, What's New and What's Next},
key = {2003new},
year = {2003},
month = {October},
note = { Bell Labs, October 2003. A shorter version was given at the DIMACS Mixer at AT&T Labs.},
url = {../slides/cmhotnext.pdf},
pubsite = {http://dimacs.rutgers.edu/Events/2003/program-mix1.html},
abstract = {
Algorithmic work on processing massive data streams has focussed on producing efficient summaries of streams to answer particular queries. The count-min sketch answers point, range and inner product queries, and can be used as the basis of algorithms to find approximate heavy hitters and quantiles. It improves space and update time over previous solutions for dynamic data streams and the analysis is quite simple. }
}
@misc{TalkGames04,
title = {How hard are computer games?},
key = {2004games},
year = {2004},
month = {February},
note = {DIMACS, February 2004.},
url = {../slides/lemmingstalk.pdf},
abstract = {
Computer Scientists often devote a great amount of time and effort in
playing computer games. With the intention of legitimizing this endeavour
in the name of academic research, I will focus on the algorithmic
complexity of computer games. I'll begin by surveying some of the known
results and outline the proof techniques, for such well known games as
Minesweeper, Sokoban and Tetris.
In the game of Lemmings, the player must guide a tribe of green-haired
lemming creatures to safety, and save them from an untimely demise. I
will describe and analyze the problem of deciding whether or not it is
possible to complete a level of this game (and hence to find a solution to
that level). Under certain limitations, this can be decided in polynomial
time, but I will show that in general the problem is NP-Hard. I will
outline my latest results, to show that this hardness holds even if there
is only a single lemming to save, thus showing that it is hard to
approximate the number of saved lemmings to any factor.}
}
@misc{TalkNetwork04,
title = {Algorithms for Processing Massive Data at Network Line Speed},
key = {2004net},
year = {2004},
month = {March},
url = {../slides/cmtalk.pdf},
abstract = {Many fundamental data mining problems gain additional complexity when applied to massive amounts data that is being generated at high speed. Networks are a driving force in massive data analysis, and require new approaches to produce the analyses network managers require. This motivates the data stream approach: using a small amount of space to process data very quickly in one pass. This has resulted in many strong and deep algorithmic results in this model of computation. But there is frequently a gap between theory and practice, since existing algorithms are too slow for typical high data rates by several orders of magnitude.
I will describe my recent work aimed at bridging the gap. Firstly, I will describe the Count-Min sketch, which is a simple and hence fast data structure that can be used to approximate entries of a high dimensional vector in very small space. It can be used as a ''black box'' to solve several problems, including finding frequent items, and quantiles of the data distribution, and gives provable guarantees about their results. Secondly, I will describe and analyze a solution to the change detection problem, where the aim is to identify items (eg IP addresses) whose behavior has changed between two periods. Both these methods have been implemented and are running on network data at network line speeds with high accuracy. }
}
@misc{TalkWDSA07,
title = {Computational Fundamentals of Analyzing and Mining Data
Streams},
key = {2007WDSA},
year = {2007},
note = {Tutorial at Workshop on Data Stream Analysis, Caserta,
Italy},
month = {March},
url = {../papers/cormodewdsa.pdf},
slides = {../slides/streammining.pdf},
abstract = { Many scenarios, such as network analysis, utility
monitoring, and financial applications, generate massive streams of data.
These streams consist of millions or billions of simple updates every
hour, and must be processed to extract the information described in tiny
pieces. This survey will introduce the problems of data stream monitoring,
and some of the techniques that have been developed over recent years to
help find the nuggets of gold while avoiding drowning in these massive
streams of information.
In particular, this talk will introduce the fundamental techniques used to
create compact summaries of data streams: sampling, sketching, and other
synopsis techniques. It will describe how to extract features from the
stream such as frequent items, medians, and association rules. Lastly, we
will see methods to detect when and how the process generating the stream
is evolving, indicating some important change has occurred.
}
}
@misc{TalkGraphstream09,
title = {Processing Graph Streams: Upper and Lower Bounds},
key = {2009graph},
year = 2009,
month = jun,
note = {Talk at Workshop on Algorithms and Models for Complex Networks, Bristol UK},
slides = {../slides/graphsbristol.pdf},
abstract = {
In the graph streaming model of computation, we see the input graph presented as a sequence of edges, in some arbitrary order, and must perform computations with one or few passes over this input. Crucial parameters are the amount of working memory needed, and the time per update.
In this talk, I'll discuss some of the key results in this youthful field.
These include:
\begin{itemize}
\item
lower bounds on the memory needed for fundamental computations
(testing connectedness and bipartiteness)
\item
very small memory algorithms for approximating properties of the degree
sequence and counting small subgraphs
\item
constructing spanners and matchings with memory proportional to the
number of nodes but sublinear in the number of edges
\item
extensions to cases where each edge may be duplicated multiple times
\end{itemize}}
}
@misc{TalkFreq09,
title = {Finding Frequent Items in Data Streams},
key = {2009Freq},
year = 2009,
note = {Talk at DIMACS Working group on Streaming, Coding and Compressive Sensing; AT\&T Labs; UMass Amherst; Dartmouth College},
month = {March},
slides = {../slides/freqslides.pdf},
abstract = {
The frequent items problem is to process a stream of items and find
all items occurring more than a given fraction of the time. It is one
of the most heavily studied problems in data stream mining, dating
back to the 1980s. Many applications rely directly or indirectly
on finding the frequent items, and implementations are in use in
large scale industrial systems. In this talk, I survey the problem, and present the most important algorithm in a common framework. I'll also show some experimental comparisons, and discuss some more recent results which explain why some of these algorithms give even better results than their initial analysis led us to believe.
}
}
@misc{TalkWeb207,
title = {Analyzing Web 2.0, Blogs and Social Networks},
key = {2007Web},
year = 2007,
month = dec,
note = {Talk at AT\&T Labs}
}
This file was generated by bibtex2html 1.92.