@comment{{This file has been generated by bib2bib 1.92}}

@comment{{Command line: C:\alldata\data\webbib\bib2bib-1.92.exe -oc talks -ob talks.bib -c '($type = "MISC")' cormode.bib}}

@misc{CormodeJhaKulkarniLiSrivastavaWang18, author = {Graham Cormode and Somesh Jha and Tejas Kulkarni and Ninghui Li and Divesh Srivastava and Tianhao Wang}, title = {Privacy at Scale: Local Differential Privacy in Practice}, note = {Tutorial at SIGMOD and KDD}, year = 2018, url = {../papers/ldptutorial.pdf}, abstract = {Local differential privacy (LDP), where users randomly perturb their inputs to provide plausible deniability of their data without the need for a trusted party, has been adopted recently by several major technology organizations, including Google, Apple and Microsoft. This tutorial aims to introduce the key technical underpinnings of these deployed systems, to survey current research that addresses related problems within the LDP model, and to identify relevant open problems and research directions for the community.} }

@misc{TalkRR17,
  key    = {RR17},
  month  = jan,
  year   = 2018,
  title  = {Distributed Private Data Collection at Scale},
  note   = {Talk at HiPEDs CDT (Imperial)},
  slides = {../slides/scalable18.pdf}
}

@misc{TalkMarginals17, key = {Marginals 17}, month = nov, year = 2017, title = {Locally Private Release of Marginal Statistics}, note = {Talk at Google (Zurich) Algorithms and Optimization Day}, slides = {../slides/rrgoog17-split.pdf} }

@misc{TalkConfounding17, key = {Confounding 17}, month = sep, year = 2017, title = {The confounding problem of private data release}, note = {Talk at EPSRC Workshop on Future Research Directions in Demand Management, Oxford University (CS)}, slides = {../slides/confounding17.pdf}, abstract = {The demands to make data available are growing ever louder, to support open data initiatives and ``data monetization''. But the problem of doing so without disclosing confidential information is a subtle and difficult one. Is ``private data release'' an oxymoron? This talk delves into the motivations for data release, and highlights some of the pitfalls. I'll conclude by describing some recent work on algorithms that meet a statistical guarantee of (differential) privacy, building on a 50-year old algorithm that tosses a single (biased) coin.} }

@misc{TalkINI16, key = {INI 2016}, month = nov, year = 2016, note = {Talk at Isaac Newton Institute}, title = {Engineering Privacy for Small Groups}, slides = {../slides/smallgroups.pdf}, abstract = { Concern about how to collect sensitive user data without compromising individual privacy is a major barrier to greater availability of data. The model of Local Differential Privacy has recently gained favor and enjoys widespread adoption for data gathering from browsers and apps. Deployed methods use Randomized Response, which applies only when the user data is a single bit. We study general mechanisms for data release which allow the release of statistics from small groups. We formalize this by introducing a set of desirable properties that such mechanisms can obey. Any combination of these can be satisfied by solving a linear program which minimizes a cost function. We also provide explicit constructions that are optimal for certain combinations of properties, and show a closed form for their cost. In the end, there are only three distinct optimal mechanisms to choose between: one is the well-known (truncated) geometric mechanism; the second a novel mechanism that we introduce, and the third is found as the solution to a particular LP. Through a set of experiments on real and synthetic data we determine which is preferable in practice, for different combinations of data distributions and privacy parameters. Joint work with Tejas Kulkarni and Divesh Srivastava }, video = {http://www.newton.ac.uk/seminar/20161110153016302} }

@misc{TalkSEA17, key = {SEA 2017}, month = jun, year = 2017, title = {Engineering Streaming Algorithms}, note = {Invited talk at Symposium on Experimental Algorithms}, slides = {../slides/sea-esa.pdf}, abstract = {Streaming algorithms must process a large quantity of small updates quickly to allow queries about the input to be answered from a small summary. Initial work on streaming algorithms laid out theoretical results, and subsequent efforts have involved engineering these for practical use. Informed by experiments, streaming algorithms have been widely implemented and used in practice. This talk will survey this line of work, and identify some lessons learned.} }

@misc{TalkDisc16,
  key      = {Disc 2016},
  month    = sep,
  year     = 2016,
  title    = {Matching and Covering in Streaming Graphs},
  note     = {Invited keynote talk at DISC 2016},
  slides   = {../slides/disc16.pdf},
  abstract = {Problems related to (maximum) matchings and vertex covering in graphs have a long history in Combinatorics and Computer Science. They arise in many contexts, from choosing which advertisements to display to online users, to characterizing properties of chemical compounds. Stable matchings have a suite of applications, from assigning students to universities, to arranging organ donations. These have been addressed in a variety of different computation models, from the traditional RAM model, to more recent sublinear (property testing) and external memory (MapReduce) models. Matching has also been studied for a number of classes of input graph: including general graphs, bipartite graphs, weighted graphs, and those with some sparsity structure. We focus on the streaming case, where each edge is seen once only, and we are restricted to space sublinear in the size of the graph (i.e., no. of its vertices). In this case, the objective is to find (approximately) the size of the matching. Even here, results for general graphs are either weak or make assumptions about the input or the stream order. In this talk, we describe work which seeks to improve the guarantees in various ways. First, we consider the case when we are given a promise on the size of the solution: the matching is of size at most $k$, say. This puts us in the realm of parameterized algorithms and kernelization, but with a streaming twist. We show that algorithms to find a maximal matching can have space which grows quadratically with $k$. Second, we consider restricting to graphs that have some measure of sparsity---bounded arboricity, or bounded degree. This aligns with reality, where most massive graphs have asymptotically fewer than $O(n^2)$ edges. In this case, we show algorithms whose space cost is polylogarithmic in the size of the input, multiplied by a constant that depends on the level of sparsity, in order to estimate the size of the maximum matching. The techniques used rely on ideas of sampling and sketching, developed to handle data which arrives as a stream of observations, coupled with analysis of the resulting randomized algorithms.}
}

@misc{TalkConfounding16,
  year     = {2016},
  key      = {Confounding},
  month    = sep,
  title    = {The confounding problem of private data release},
  note     = {Invited talk at Heilbronn Conference; WMG; Liverpool University},
  slides   = {../slides/confounding16.pdf},
  abstract = { The demands to make data available are growing ever louder, including open data initiatives and freedom of information requests. But the problem of doing so without disclosing confidential information is a subtle and difficult one. Is ``private data release'' an oxymoron? This talk aims to delve into the motivations of data release, explore the challenges, and outline some of the current statistical approaches developed in response to this confounding problem.}
}

@misc{TalkCorrel16, title = {Sub-Quadratic Recovery of Correlated Pairs}, key = {subq}, year = 2016, month = jun, note = {Talk at Google Research, Facebook, Simons Institute, Manchester U., LSE}, link = {https://simons.berkeley.edu/talks/graham-cormode-2016-06-29}, video = {https://www.youtube.com/watch?v=ZGk5bNKzuyw}, slides = {../slides/correl.pdf}, abstract = {Identifying correlations within multiple streams of high-volume time series is a general but challenging problem. A simple exact solution has cost that is linear in the dimensionality of the data, and quadratic in the number of streams. In this work, we use dimensionality reduction techniques (sketches), along with ideas derived from coding theory and fast matrix multiplication to allow fast (subquadratic) recovery of those pairs that display high correlation. } }

@misc{TalkStreaming15, note = {Invited tutorial in PODS 2015 and BICOD 2015}, month = may, key = {stream15}, year = 2015, title = {Compact Summaries over Large Datasets}, url = {../papers/pods204t-cormode.pdf}, slides = {../slides/streaming15.pdf}, abstract = { A fundamental challenge in processing the massive quantities of information generated by modern applications is in extracting suitable representations of the data that can be stored, manipulated and interrogated on a single machine. A promising approach is in the design and analysis of compact summaries: data structures which capture key features of the data, and which can be created effectively over distributed data sets. Popular summary structures include the count distinct algorithms, which compactly approximate item set cardinalities, and sketches which allow vector norms and products to be estimated. These are very attractive, since they can be computed in parallel and combined to yield a single, compact summary of the data. This tutorial introduces the concepts and examples of compact summaries. } }

@misc{TalkSIP15, year = 2015, key = {SIP15}, month = apr, title = {Trusting the Cloud with Practical Interactive Proofs}, note = {Talk at Google NYC, Bristol, Oxford Algorithms Day, Durham}, slides = {../slides/sip2015.pdf}, video = {https://www.youtube.com/watch?v=v5mWIqEJXK0&feature=youtu.be&list=PLqxsGMRlY6u659-OgCvs3xTLYZztJpEcW}, abstract = { When handling large quantities of data, it is often necessary to outsource the computational effort to a third party: this captures current efforts in cloud computing, but also scenarios within trusted computing systems and inter-organizational data sharing. When the third party is not fully trusted, it is important to give assurance that the computation has been performed correctly. This talk presents some recent results in designing new protocols for verifying computations which are streaming in nature: the verifier (data owner) needs only a single pass over the input, storing a sublinear amount of information, and follows a simple protocol with a prover (service provider) that takes a small number of rounds. A dishonest prover fools the verifier with only vanishingly small probability, while an honest prover's answer is always accepted. Starting from basic aggregations, interactive proof techniques allow a quite general class of computations to be verified, leading to practical implementations. } }

@misc{TalkEDBTICDT15,
  year     = {2015},
  key      = {EDBT15},
  month    = mar,
  title    = {The confounding problem of private data release},
  note     = {Invited talk at EDBT/ICDT 2015},
  slides   = {../slides/confounding.pdf},
  url      = {../papers/confounding.pdf},
  abstract = { The demands to make data available are growing ever louder, including open data initiatives and ``data monetization''. But the problem of doing so without disclosing confidential information is a subtle and difficult one. Is ``private data release'' an oxymoron? This paper (accompanying an invited talk) aims to delve into the motivations of data release, explore the challenges, and outline some of the current statistical approaches developed in response to this confounding problem. }
}

@misc{TalkCopenhagen14, year = {2014}, key = {Copenhagen14}, month = jul, title = {Sketches, Streaming and Big Data}, note = {Summer school on Hashing at University of Copenhagen}, slides = {http://www.diku.dk/summer-school-2014/course-material/graham-cormode/}, video = {http://www.diku.dk/summer-school-2014/course-material/graham-cormode/} }

@misc{TalkYandex13, year = {2013}, key = {Yandex13}, month = sep, title = {Sketch data structures and concentration bounds / Mergeable summaries}, note = {Invited tutorial at Yandex conference}, slides = {../slides/yandex.pdf}, video = {http://shad.yandex.ru/conference/cormode.xml} }

@misc{TalkKDD14, year = {2014}, key = {KDD14}, month = aug, title = {Sampling for Big Data}, abstract = {One response to the proliferation of large datasets has been to develop ingenious ways to throw resources at the problem, using massive fault tolerant storage architectures, parallel and graphical computation models such as MapReduce, Pregel and Giraph. However, not all environments can support this scale of resources, and not all queries need an exact response. This motivates the use of sampling to generate summary datasets that support rapid queries, and prolong the useful life of the data in storage. To be effective, sampling must mediate the tensions between resource constraints, data characteristics, and the required query accuracy. The state-of-the-art in sampling goes far beyond simple uniform selection of elements, to maximize the usefulness of the resulting sample. This tutorial reviews progress in sample design for large datasets, including streaming and graph-structured data. Applications are discussed to sampling network traffic and social networks. }, slides = {../slides/TutorialKDD2014.pptx}, video = {http://videolectures.net/kdd2014_cormode_duffield_sampling_data/}, note = {Tutorial at SIGKDD 2014 conference} }

@misc{TalkSimons13, year = {2013}, key = {Simons13}, month = sep, title = {Streaming, Sketching and Sufficient Statistics}, note = {Invited tutorial at Big Data Boot Camp, Simons Institute for Theoretical Computer Science, Berkeley}, slides = {http://simons.berkeley.edu/sites/default/files/docs/529/cormodeslides.pdf}, video = {http://www.youtube.com/watch?v=-QSkMcPmXN8}, abstract = {One natural way to deal with the challenge of Big Data is to make the data smaller. That is, to seek a compact (sublinear) representation of the data so that certain properties are (approximately) preserved. We can think of these as a generalization of sufficient statistics for properties of the data. The area of ``streaming algorithms'' seeks algorithms which can build such a summary as information is processed incrementally. An important class of streaming algorithms are sketches: carefully designed random projections of the input data that can be computed efficiently under the constraints of the streaming model. These have the attractive property that they can be easily computed in parallel over partitions of the input. They aim at optimizing a variety of properties: the size of the summary; the time required to compute the summary; the number of 'true' random bits required; and the accuracy guarantees that result. This tutorial will present some of the powerful results that have emerged from these efforts: * sketches for sparse recovery over high dimensional inputs, with applications to compressed sensing and machine learning; * effective sketches for Euclidean space, providing fast instantiations of the Johnson-Lindenstrauss Lemma; * techniques for sampling from the support of a vector, and estimating the size of the support set; * applications of these ideas to large graph computations and linear algebra; * lower bounds on what is possible to summarize, arising from Communication Complexity and Information Complexity. } }

@misc{Talkdatsci13, year = 2013, key = {2013datsi}, month = jun, title = {Computing + Statistics = Data Science}, note = {An introduction to data science for teenagers, IGGY DUX awards, Experience Warwick.}, slides = {../slides/datsciout.pdf} }

@misc{Talkbigprivacy13, key = {2013abudhabi}, year = 2013, month = mar, title = {Privacy and Big Data: Challenges and Promise}, note = {Invited panel at NYU Abu Dhabi conference on Big Data Systems, Applications, and Privacy }, slides = {../slides/abudhabiprivacy.pdf} }

@misc{TalkREU13, year = 2013, key = {2013REU}, month = mar, title = {Current Industry Trends in Computer Science Research}, slides = {../slides/reu2013.pdf}, note = {Invited Talk/Panel at NSF Research Experience for Undergraduates PI Meeting} }

@misc{TalkCiE13, key = {CiE}, title = {Summary Data Structures for Massive Data}, year = 2013, month = jul, note = {Invited talk in Session on Data Streams and Compression, Computability in Europe 2013}, url = {../papers/cie13.pdf}, slides = {../slides/cie13.pdf}, abstract = { Prompted by the need to compute holistic properties of increasingly large data sets, the notion of the ``summary'' data structure has emerged in recent years as an important concept. Summary structures can be built over large, distributed data, and provide guaranteed performance for a variety of data summarization tasks. Various types of summaries are known: summaries based on random sampling; summaries formed as linear sketches of the input data; and other summaries designed for a specific problem at hand. } }

@misc{TalkPrivDB13,
  key      = {PrivDB},
  title    = {Building Blocks of Privacy: Differentially Private Mechanisms},
  year     = 2013,
  month    = apr,
  slides   = {../slides/privdb-tutorial.pdf},
  note     = {Invited tutorial talk at Privacy Preserving Data Publication and Analysis (PrivDB) workshop},
  abstract = {The model of differential privacy has become widely accepted as a suitable way to release information about private data while preserving the privacy of the individuals contributing to the data. One reason for its popularity is the availability of several ``mechanisms'' that produce output meeting the differential privacy guarantee. A second reason is that there are simple rules for reasoning about the composition of multiple differentially private outputs. Taken together, this provides a set of building blocks and cement which can be used to produce algorithms for private data release. In this tutorial, I will review some of the popular mechanisms for differential privacy, and show some of the ways that they can be used for a variety of different data releases.}
}

@misc{TalkDataDrivenPrivacy12, title = {Data-driven concerns in Private Data Release}, month = sep, year = 2012, key = {2012Privacy}, slides = {../slides/datadrivendifferential.pdf}, abstract = {Many users are clamouring for privacy techniques so that they can publish, share or sell their data safely. There are many aspects of data that then have to be managed: the data is often sparse, multi-dimensional, and structured. Each of these requires mechanisms that are aware of these features, so that they provide output which is useful and efficient to compute, in addition to providing a privacy guarantee. This talk outlines some recent efforts to generate such practical mechanisms. }, note = {Talk at Stevens Institute of Technology; AT\&T Labs; UMass Amherst; Rutgers University-Newark; Bell Labs; NYU-Abu Dhabi.} }

@misc{TalkVerification13, title = {Streaming Verification of Outsourced Computation}, month = may, year = 2013, key = {2013verify}, slides = {http://research.microsoft.com/en-us/events/bda2013/sip2013.pdf}, video = {http://research.microsoft.com/apps/video/default.aspx?id=193503&l=i}, abstract = {When handling large quantities of data, it is desirable to outsource the computational effort to a third party: this captures current efforts in cloud computing, but also scenarios within trusted computing systems and inter-organizational data sharing. When the third party is not fully trusted, it is desirable to give assurance that the computation has been performed correctly. This talk presents some recent results in designing new protocols for verifying computations which are streaming in nature: the verifier (data owner) needs only a single pass over the input, storing a sublinear amount of information, and follows a simple protocol with a prover (service provider) that takes a small number of rounds. A dishonest prover fools the verifier with only polynomially small probability, while an honest prover's answer is always accepted. Starting from basic aggregations, interactive proof techniques allow a quite general class of computations to be verified, leading to practical implementations.}, note = {Talk at Big Data Analytics Workshop, Microsoft Research Cambridge, and University of Warwick} }

@misc{TalkCormode11CD,
  title    = {Continuous Distributed Monitoring: A Short Survey},
  note     = {Invited keynote at Algorithms and Models for Distributed Event Processing (AlMoDEP)},
  month    = sep,
  year     = 2011,
  key      = {2011CormodeCD},
  link     = {Cormode11CD.html},
  slides   = {../slides/cdsurvey.pdf},
  abstract = {In the model of continuous distributed monitoring, a number of observers each see a stream of observations. Their goal is to work together to compute a function of the union of their observations. This can be as simple as counting the total number of observations, or more complex non-linear functions such as tracking the entropy of the induced distribution. Assuming that it is too costly to simply centralize all the observations, it becomes quite challenging to design solutions which provide a good approximation to the current answer, while bounding the communication cost of the observers, and their other resources such as their space usage. This survey introduces this model, and describes a selection of results in this setting, from the simple counting problem to a variety of other functions that have been studied.}
}

@misc{TalkSAMSI12,
  title = {Sketches: Past, Present and Future},
  key   = {2012samsi},
  year  = 2012,
  note  = {Invited Panel on Sketching and Streaming at SAMSI Workshop, 2012}
}

@misc{TalkSSBD12, title = {Small Summaries for {Big Data}}, key = {2012ssbd}, note = {Talk at Duke ARO workshop on Big Data at Large; MSR Cambridge; Princeton.}, year = 2012, abstract = { In dealing with big data, we often need to look at a small summary to get the big picture. Over recent years, many new techniques have been developed which allow important properties of large distributions to be extracted from compact and easy-to-build summaries. In this talk, I'll highlight some examples of different types of summarization: sampling, sketching, and special-purpose. Then I'll outline the road ahead for further development of such summaries. } }

@misc{TalkCormodeW8F, title = {Some sketchy results}, key = {2011sketchy}, note = {Talk at DIMACS Workshop on Algorithms in the Field (8F)}, month = may, year = 2011, slides = {../slides/sketchy.pdf}, video = {http://www.youtube.com/watch?v=M1QgeUIUlQM}, abstract = { `Sketch' data structures are simple, efficient, compact summaries of large data sets. Sketches enable approximate computations like similarity comparison and summarization of distributions. There have been a number of successes where use of sketches have been applied in the field, most notably when dealing with very large data sets. In this talk, I'll present some key sketches and their properties, and describe some successes and some non-successes in their application. I'll conclude by outlining emerging applications for sketches, and their future potential. } }

@misc{TalkCormodeMergeableTalk11, title = {Mergeable Summaries}, note = {Talk at Harvard University; DIMACS; Johns Hopkins; University of Pennsylvania; AT\&T Labs; Warwick University}, key = {2011mergeable}, month = apr, year = 2011, slides = {../slides/mergeable-long.pdf}, abstract = { In dealing with massive distributed data, exact computations may be possible but can still be very expensive to perform. Instead, it is often desirable to look to more lightweight computations that give (guaranteed) approximations. A central approach is the idea of the mergeable summary: a compact data structure that summarizes a data set, which can be merged with others to summarize the union of the data without significant growth in size. This framework enables parallel computation. Samples and sketches are two examples of well-known, effective mergeable summaries. I'll present recent results on new, compact mergeable summaries for various tasks, such as approximating frequent items, order statistics, and range counts. } }

@misc{TalkCormodeColumbia11, title = {Data Anonymization}, key = {2011anon}, note = {Guest lecture in 'Dealing with Massive Data' at Columbia University}, slides = {../slides/privacy-columbia.pdf}, month = mar, year = 2011 }

@misc{TalkMergeable10, title = {Distributed Summaries}, note = {Talk at DIMACS workshop on Parallelism: a 2020 vision}, year = 2011, key = {2011mergeableshort}, slides = {../slides/mergeable.pdf}, abstract = { In dealing with massive distributed data, exact computations may be possible but can still be very expensive to perform. Instead, it is often desirable to look to more lightweight computations that give (guaranteed) approximations. A central approach is the idea of the mergeable summary: a compact data structure that summarizes a data set, which can be merged with others to summarize the union of the data without significant growth in size. This framework enables parallel computation. Samples and sketches are two examples of well-known, effective mergeable summaries. I'll present recent results on new, compact mergeable summaries for various tasks, such as approximating frequent items, order statistics, and range counts. } }

@misc{TalkDbirday09,
  title    = {Anonymization and Uncertainty in Social Network Data},
  abstract = {Many privacy questions arise when collecting information about individuals: how to allow use of this data without compromising the privacy of the data subjects? How to ensure that the end users of this data can find useful answers to their queries? Various anonymization techniques have been proposed which aim to find meaningful tradeoffs between privacy and utility. This talk presents the background for privacy and anonymization. Then I'll describe methods for anonymizing social network data, represented as a large, sparse semantic graph, which is additionally challenging due to the complex patterns of interactions between individuals. I'll also discuss some unexpected connections to uncertain data management, which provides many further directions for future work. This talk covers joint work with Divesh Srivastava, Balachander Krishnamurthy, and Smriti Bhagat. },
  year     = {2009},
  month    = oct,
  key      = {dbirday09},
  slides   = {../slides/dbirday.pdf},
  note     = {Invited talk at DBIR Day 2009 at NYU Poly},
  pubsite  = {http://cis.poly.edu/dbir2009/}
}

@misc{TalkSIP10,
  title    = {SIPping from the firehose: Streaming Interactive Proofs for verifying computations},
  year     = {2010},
  month    = feb,
  key      = {sip10},
  slides   = {../slides/sip2010.pdf},
  note     = {Invited talk at Bristol Algorithms Days 2010; University of Maryland},
  abstract = { When handling large quantities of data, it is desirable to outsource the computational effort to a third party: this captures current efforts in cloud computing, but also scenarios within trusted computing systems and inter-organizational data sharing. When the third party is not fully trusted, it is desirable to give assurance that the computation has been performed correctly. This talk presents some recent results in designing new protocols for verifying computations which are streaming in nature: the verifier (data owner) needs only a single pass over the input, storing a sublinear amount of information, and follows a simple protocol with a prover (service provider) that takes a small number of rounds. A dishonest prover fools the verifier with only polynomially small probability, while an honest prover's answer is always accepted. Within this model, we show protocols for a variety of problems that select items from the input (e.g. median, heavy hitters), or compute certain aggregates or structures (e.g. frequency moments, number of triangles in a graph). These problems require linear space to compute (exactly) in the traditional streaming model, showing that outsourcing can exponentially reduce the effort needed by the verifier to obtain correct answers. }
}

@misc{TalkMinimality10,
  title    = {Progress in Data Anonymization: from k-anonymity to the minimality attack},
  year     = {2010},
  key      = {minimality10},
  month    = feb,
  slides   = {../slides/minimality.pdf},
  note     = {Talk in Bristol},
  abstract = { The last decade has seen much effort directed to addressing data anonymization: the problem of publishing a modified version of a dataset so that inference about individuals in the data set is limited. As with cryptography, progress has been made by new methods being proposed and their weaknesses being identified, leading to improved algorithms. The simplest anonymity definition is k-anonymity: every individual in the data set should be indistinguishable from k-1 others. However, if there is homogeneity among this set of individuals then undesired inference is possible. To counter this, definitions such as l-diversity and t-closeness were advanced. Recently it was shown that over eager attempts to enforce these definitions can itself allow unwanted inferences to be drawn. This ``minimality attack'' has stood for several years. But recently it has been shown that this attack is less powerful than first thought: appropriately designed algorithms are invulnerable, and even in the most pessimistic case, the power of the attack can be bounded by a multiplicative factor of e (=2.718...). In this talk, I will introduce these approaches to anonymization from first principles, working through measure and counter-measure, ending with our recent analysis of the minimality attack. }
}

@misc{CormodeSrivastava09, title = {Anonymized Data: Generation, Models, Usage}, author = {Graham Cormode and Divesh Srivastava}, att_authors = {gc2602,ds8961}, att_private = {false}, year = {2009}, note = {Tutorial at SIGMOD 2009}, month = jul, url = {../papers/anontut.pdf}, slides = {../slides/anontut.ppt}, abstract = { Data anonymization techniques have been the subject of intense investigation in recent years, for many kinds of structured data, including tabular, graph and item set data. They enable publication of detailed information, which permits ad hoc queries and analyses, while guaranteeing the privacy of sensitive information in the data against a variety of attacks. In this tutorial, we aim to present a \emph{unified} framework of data anonymization techniques, viewed through the lens of uncertainty. Essentially, anonymized data describes a set of possible worlds, one of which corresponds to the original data. We show that anonymization approaches such as suppression, generalization, perturbation and permutation generate different working models of uncertain data, some of which have been well studied, while others open new directions for research. We demonstrate that the privacy guarantees offered by methods such as $k$-anonymization and $l$-diversity can be naturally understood in terms of similarities and differences in the sets of possible worlds that correspond to the anonymized data. We describe how the body of work in query evaluation over uncertain databases can be used for answering ad hoc queries over anonymized data in a principled manner. A key benefit of the unified approach is the identification of a rich set of new problems for both the Data Anonymization and the Uncertain Data communities. } }

@misc{CormodeSrivastava10, title = {Anonymized Data: Generation, Models, Usage}, author = {Graham Cormode and Divesh Srivastava}, att_authors = {gc2602,ds8961}, att_private = {false}, year = {2010}, note = {Tutorial at ICDE 2010}, month = mar, url = {../papers/anontut10.pdf}, slides = {../slides/anontut10.ppt}, abstract = { Data anonymization techniques have been the subject of intense investigation in recent years, for many kinds of structured data, including tabular, graph and item set data. They enable publication of detailed information, which permits ad hoc queries and analyses, while guaranteeing the privacy of sensitive information in the data against a variety of attacks. In this tutorial, we aim to present a \emph{unified} framework of data anonymization techniques, viewed through the lens of uncertainty. Essentially, anonymized data describes a set of possible worlds, one of which corresponds to the original data. We show that anonymization approaches such as suppression, generalization, perturbation and permutation generate different working models of uncertain data, some of which have been well studied, while others open new directions for research. We demonstrate that the privacy guarantees offered by methods such as $k$-anonymization and $l$-diversity can be naturally understood in terms of similarities and differences in the sets of possible worlds that correspond to the anonymized data. We describe how the body of work in query evaluation over uncertain databases can be used for answering ad hoc queries over anonymized data in a principled manner. A key benefit of the unified approach is the identification of a rich set of new problems for both the Data Anonymization and the Uncertain Data communities. } }

@misc{TalkMike66-08, title = {On 'Selection and sorting with limited storage'}, note = {Talk at Mike66 Workshop celebrating Mike Paterson}, url = {../slides/mike66-medians.ppt}, slides = {../slides/mike66-medians.pdf}, month = sep, year = 2008, key = {2008}, abstract = { In 'Selection and Sorting with Limited Storage' [MP78], Munro and Paterson presented results on the selection (median finding) problem. Their model allowed a small number of one-way passes over input stored on a read-only tape. This paper is now regarded as one of the first works in the streaming model. I will recap the results in [MP78], and discuss two more recent results on this theme: approximate selection on a stream with both 'insertion' and 'deletion' records; and tight lower bounds for multi-pass median finding when the input data is randomly ordered.} }

@misc{TalkDagstuhl08, title = {Algorithms for Distributed Functional Monitoring}, note = {Talk at Dagstuhl Seminar on Sublinear Algorithms}, slides = {../slides/fmonitoring.pdf}, month = aug, year = 2008, key = {2008}, abstract = { We study what we call {\em functional monitoring} problems. We have $k$ players each receiving a stream of items, and communicating with a central coordinator. Let the multiset of items received by player $i$ up until time $t$ be $A_i(t)$. The coordinator's task is to monitor a given function $f$ computed over the union of the inputs $\cup_{i} A_i(t)$, {\em continuously} at all times $t$. The goal is to minimize the number of bits communicated between the players and the coordinator. Of interest is the approximate version where the coordinator outputs $1$ if $f \geq \tau$ and $0$ if $f\leq (1-\epsilon)\tau$. This defines the $(k,f,\tau,\epsilon)$ distributed, functional monitoring problem. Functional monitoring problems are fundamental in distributed systems, in particular sensor networks, where we must minimize communication; they also connect to problems in streaming algorithms, communication complexity, communication theory, and signal processing. Yet few formal bounds are known for functional monitoring. We give upper and lower bounds for the $(k,f,\tau,\epsilon)$ problem for some of the basic $f$'s. In particular, we study frequency moments ($F_0, F_1, F_2$). For $F_0$ and $F_1$, we obtain continuously monitoring algorithms with costs almost the same as their one-shot computation algorithms. However, for $F_2$ the monitoring problem seems much harder. We give a carefully constructed multi-round algorithm that uses ``sketch summaries'' at multiple levels of detail and solves the $(k,F_2,\tau,\epsilon)$ problem with communication $\tilde{O}(k^2/\epsilon + k^{3/2}/\epsilon^3)$.} }

@misc{TalkBristol08, title = {Data Stream Algorithms}, note = {Tutorial at Bristol Summer School on Probabilistic Techniques in Computer Science.}, url = {../papers/bristol.pdf}, video = {http://www.cs.bris.ac.uk/probtcs08/videos-cormode.jsp}, slides = {../slides/bristol-all.ppt}, month = jul, year = 2008, key = {2008}, abstract = { Many scenarios, such as network analysis, utility monitoring, and financial applications, generate massive streams of data. These streams consist of millions or billions of simple updates every hour, and must be processed to extract the information described in tiny pieces. These notes provide an introduction to (and set of references for) {\em data stream algorithms}, and some of the techniques that have been developed over recent years to help mine the data while avoiding drowning in these massive flows of information. } }

@misc{CormodeGarofalakis06VLDB, author = {G. Cormode and M. Garofalakis}, att_authors = {gc2602}, att_private = {false}, title = {Streaming in a Connected World: Querying and Tracking Distributed Data Streams}, note = {Tutorial at VLDB 2006, SIGMOD 2007, EDBT 2008}, year = {2008}, month = mar, url = {../papers/cdsummary.pdf}, video = {http://doi.acm.org/10.1145/1247480.1247649}, slides = {../slides/vldb06tut-slides.ppt}, abstract = { Today, a majority of data is fundamentally distributed in nature. Data for almost any task is collected over a broad area, and streams in at a much greater rate than ever before. In particular, advances in sensor technology and miniaturization have led to the concept of the sensor network: a (typically wireless) collection of sensing devices collecting detailed data about their surroundings. A fundamental question arises: how to query and monitor this rich new source of data? Similar scenarios emerge within the context of monitoring more traditional, wired networks, and in other emerging models such as P2P networks and grid-based computing. In all cases, if we can perform more computational work within the network to reduce the communication needed, then we can reduce bandwidth usage and improve performance. We consider two broad classes of approaches to such in-network query processing, by analogy to query types in traditional DBMSs. In the one shot model, a query is issued by a user at some site, and must be answered based on the current state of data in the network. We identify several possible approaches to this problem. For simple queries, partial computation of the result over a tree can reduce the data transferred significantly. For ``holistic'' queries, such as medians, count distinct and so on, clever composable summaries give a compact way to accurately approximate query answers. Lastly, careful modeling of correlations between measurements and other trends in the data can further reduce the number of sensors probed. 
In the continuous model, a query is placed by a user which requires the answer to be available continuously. This yields yet further challenges, since even using tree computation, summarization and modeling, we cannot afford to communicate every time new data is received by one of the remote sites. Instead, the result of work on this problem has been a new tradeoff of reduced accuracy in the query answer for reduced communication cost. This has led to a variety of techniques for different query types to apportion the available ``uncertainty'' in the query answer between different sites, and to model the evolution of measured values to anticipate future values and so reduce communication further. Our objective in this tutorial is to discuss the algorithmic foundations of this new world, illustrate some of the powerful techniques that have been developed to address these challenges, and outline interesting directions for future work in the area. Note: these slides are provided in powerpoint format for ease of use (due to embedded animations) and for use in other presentations. Please feel free to use these slides for your own purposes; we would appreciate acknowledgements and citations (to the VLDB 06 proceedings).} }

@misc{Cormode06Cluster, title = {Cluster and Data Stream Analysis}, note = {Tutorial at DIMACS Workshop on Data Mining and Epidemiology}, days = {24}, month = mar, year = {2006}, key = {2006Cluster}, slides = {../slides/clustertutorial.pdf}, abstract = { Clustering is an important tool in machine learning and data mining. It allows features and correlations in the data to be identified and requires few parameters and little detailed information about the data. The results can be used to generate hypotheses, aid in visualization, or reduce the data to a few prototypical points. This 'unsupervised learning' technique has many variants and many perspectives. I will give an algorithmic view, describing some of the most popular clustering algorithms and identifying their pros and cons, including hierarchical clustering, k-means, expectation maximization (EM) and k-center approximation algorithms. When the input data is too large to conveniently hold in memory, or is being constantly updated, it is necessary to view the data as a massive stream. In recent years the ``data stream'' model has become a popular way to handle massive data sources. I will outline some of the key properties of data streams, and illustrate this with some of the recent work in clustering on data streams. } }

@misc{CormodeBertinoro06, title = {Biased Quantiles}, note = {Bertinoro}, key = {2006BQ}, month = jun, year = {2006}, slides = {../slides/bquant-long.pdf}, abstract = { Skew is prevalent in data streams, and should be taken into account by algorithms that analyze the data. The problem of finding ``biased quantiles''--- that is, approximate quantiles which must be more accurate for more extreme values --- is a framework for summarizing such skewed data on data streams. We present the first deterministic algorithms for answering biased quantiles queries accurately with small---sublinear in the input size--- space and time bounds in one pass. The space bound is near-optimal, and the amortized update cost is close to constant, making it practical for handling high speed network data streams. We demonstrate that not only can we prove theoretical properties of the algorithm, but that it also uses less space than existing methods in many practical settings, and is fast to maintain.} }

@misc{TalkEntropy06, title = {Computing the Entropy of a stream}, key = {2006}, note = {AT\&T Labs; Bell Labs; DyDAn Center}, month = dec, year = {2006}, url = {../slides/entropyslides.pdf}, abstract = { Recently there has been much interest in using empirical entropy and related measures to analyze behaviour in networks, and identify anomalies. At the heart of this is the basic problem of, given a sequence of tokens, how to compute the empirical entropy of the sequence efficiently, in small space, and with a single pass. I'll describe some work with Amit Chakrabarti and Andrew McGregor on a simple algorithm for computing the entropy of such a stream, and the corresponding lower bounds. I'll also discuss extensions to the `graph entropy', and why higher order entropy is harder to approximate.} }

@misc{TalkCSKanpur06, title = {A Compact Survey of Compressed Sensing}, key = {2006}, note = {Workshop on Algorithms for Data Streams, IIT Kanpur, India}, month = dec, year = {2006}, url = {../slides/cs-kanpur.pdf}, link = {http://www.cse.iitk.ac.in/users/sganguly/revised-schedule.html}, abstract = {A short survey of some of the recent results under the banner of `compressed sensing', from a more algorithmic perspective.} }

@misc{TalkInvDbn05, title = {Tracking Inverse Distributions of Massive Data Streams}, key = {2005}, note = {Network Sampling Workshop in Paris, Bell Labs Research Seminar}, month = jul, year = {2005}, url = {../slides/invdbn-slides.pdf}, url2 = {../slides/invdbn-paris.pdf}, abstract = { In order to maintain networks, defend against attacks, monitor service quality, and track usage patterns, the massive amounts of traffic must be processed to extract the necessary information. With data of this scale and speed, even basic analysis tasks become difficult. Most prior research has focused on questions on the ``forward distribution'' $f(x)$, such as ``what is the median connection duration?''. I will discuss a new set of challenges that come from studying the ``inverse distribution'' $f^{-1}(i)$, such as ``what is the median number of connections made by each user?'' Tracking the inverse distribution has a variety of applications in network monitoring, and requires some new techniques based on drawing an effective sample from the inverse distribution. I will talk about how to do this for a centralized monitoring point, for a pair of monitoring points, and finally for a general distributed monitoring architecture.} }

@misc{TalkDagstuhl05, title = {Towards an Algorithmic Theory of Compressed Sensing}, key = {2005}, note = {Schloss Dagstuhl}, month = jul, year = {2005}, abstract = { In Approximation Theory, the fundamental problem is to reconstruct a signal $A \in R^n$ from linear measurements $\langle A, \psi_i \rangle$ with respect to a dictionary $\Psi$ for $R^n$. Recently, there has been tremendous excitement about the novel direction of {\em Compressed Sensing}~\cite{Donoho:04} where the reconstruction can be done with very few---$O(k \log n)$---linear measurements over a modified dictionary $\Psi'$ if the information of the signal is concentrated in $k$ coefficients over an orthonormal basis $\Psi$. These results have reconstruction error on any given signal that is optimal with respect to a broad class of signals. In a series of papers and meetings over the past year, a theory of Compressed Sensing has been developed by mathematicians. We develop an algorithmic perspective for the Compressed Sensing problem, showing that Compressed Sensing results resonate with prior work in Group Testing, Learning theory and Streaming algorithms. Our main contributions are new algorithms that present the most general results for Compressed Sensing with $1+\epsilon$ approximation on {\em every} signal, faster algorithms for the reconstruction, as well as succinct transformations of $\Psi$ to $\Psi'$.} }

@misc{TalkSkewed05, title = {Summarizing and Mining Skewed Data Streams}, url = {../slides/skewed-slides.pdf}, key = {2005}, note = {NJIT}, month = may, year = {2005} }

@misc{TalkStrings00, title = {Short String Signatures}, key = {2000}, note = {DIMACS Workshop on Sublinear Algorithms in Princeton, NJ}, month = sep, year = {2000}, url = {../slides/skewed-slides.pdf}, pubsite = {http://dimacs.rutgers.edu/Workshops/Sublinear/}, abstract = { When communication between two parties is the resource, sublinearity is vital to improve on the trivial solution of one party sending their data wholesale to the other. In the particular case of approximating distances between strings, sublinear ``signatures'' can be efficiently computed and exchanged. These signatures have applications beyond communication scenarios, in particular in approximate nearest neighbor algorithms.} }

@misc{TalkEmbed02, title = {Embeddings of Metrics on Strings and Permutations}, key = {2002embed}, note = {Workshop on Discrete Metric Spaces and their Algorithmic Applications in Haifa, Israel; BCTCS}, year = {2002}, month = mar, url = {../slides/dmsa.pdf}, pubsite = {http://cs.haifa.ac.il/~yuri/DMSA02.html} }

@misc{TalkText02, title = {Algorithmic Embeddings for Comparing Large Text Streams}, key = {2002text}, note = {CCR/DIMACS Workshop/Tutorial on Mining Massive Data Sets and Streams: Mathematical Methods and Algorithms for Homeland Defense}, month = jun, year = {2002}, url = {../slides/mmdss.pdf}, pubsite = {http://dimacs.rutgers.edu/Workshops/Homeland/}, abstract = { Texts are ubiquitous in daily life, varying in size from small (SMS and email) to potentially immense (automatically generated reports, biological sequences). When scanning for similarities between new data and previously stored examples, we need a model that takes account of how texts are changed: pieces are inserted or deleted, and sections are moved around. With a large number of large texts, trying to compare all pairs is not feasible, and for the largest texts we may not even be able to hold the whole of any text in memory at once. We describe an approach to approximately comparing large texts quickly and in sublinear space. It relies on finding combinatorial structures present in any string of characters, and generating a vector representation. This allows rapid comparison of sequences based on a succinct representation, and the application of clustering, nearest neighbor searching, and other data mining techniques.} }

@misc{TalkHot03, title = {Tracking Frequent Items Dynamically}, note = {Institute of Advanced Studies; DIMACS; Stonybrook; U. Pennsylvania}, year = {2003}, key = {2003hot}, url = {../slides/whatshot-long.pdf}, abstract = { Most database management systems maintain statistics on the underlying relation. One of the important statistics is that of the ``hot items'' in the relation: those that appear many times (most frequently, or more than some threshold). For example, end-biased histograms keep the hot items as part of the histogram and are used in selectivity estimation. Hot items are used as simple outliers in data mining, and in anomaly detection in networking applications. I will describe some novel solutions to this problem based on viewing this as a group testing problem. We give approaches using both adaptive and non-adaptive group testing. These algorithms maintain a small space data structure that monitors the transactions on the relation, and when required, quickly output all hot items, without rescanning the relation in the database. With user-specified probability, it is able to report all hot items. I will then go on to describe some work in progress on extensions to this model, when we wish to find items whose frequency has changed by a significant amount, either in absolute or relative terms; and also finding related items which together are ``hot'' but individually are not. } }

@misc{TalkL003, title = {Zeroing in on the L0 Metric}, month = aug, year = {2003}, key = {2003L0}, note = { DIMACS Workshop on Discrete Metric Spaces and their Algorithmic Applications at Princeton}, url = {../slides/L0metricspaces.pdf}, pubsite = {http://dimacs.rutgers.edu/Workshops/MetricSpaces/}, abstract = { The L0 metric is defined to be the number of positions where two vectors differ. This can be challenging to build small space approximations for when the vectors represent dynamic data sets that are being updated (streaming fashion). The general solution presented here gives a highly flexible approximation scheme, which can be applied to other problems such as approximating the Dominance Norm, a measure of worst case influence, of very large numbers of signals.} }

@misc{TalkHotNew03, title = {What's Hot, What's Not, What's New and What's Next}, key = {2003new}, year = {2003}, month = oct, note = { Bell Labs; DIMACS Mixer at AT\&T Labs.}, url = {../slides/cmhotnext.pdf}, pubsite = {http://dimacs.rutgers.edu/Events/2003/program-mix1.html}, abstract = { Algorithmic work on processing massive data streams has focussed on producing efficient summaries of streams to answer particular queries. The count-min sketch answers point, range and inner product queries, and can be used as the basis of algorithms to find approximate heavy hitters and quantiles. It improves space and update time over previous solutions for dynamic data streams and the analysis is quite simple. } }

@misc{TalkGames04, title = {How hard are computer games?}, key = {2004games}, year = {2004}, month = feb, note = {Talk at DIMACS.}, url = {../slides/lemmingstalk.pdf}, abstract = { Computer Scientists often devote a great amount of time and effort in playing computer games. With the intention of legitimizing this endeavour in the name of academic research, I will focus on the algorithmic complexity of computer games. I'll begin by surveying some of the known results and outline the proof techniques, for such well known games as Minesweeper, Sokoban and Tetris. In the game of Lemmings, the player must guide a tribe of green-haired lemming creatures to safety, and save them from an untimely demise. I will describe and analyze the problem of deciding whether or not it is possible to complete a level of this game (and hence to find a solution to that level). Under certain limitations, this can be decided in polynomial time, but I will show that in general the problem is NP-Hard. I will outline my latest results, to show that this hardness holds even if there is only a single lemming to save, thus showing that it is hard to approximate the number of saved lemmings to any factor.} }

@misc{TalkNetwork04, title = {Algorithms for Processing Massive Data at Network Line Speed}, key = {2004net}, year = {2004}, month = mar, note = {Talk at U. Iowa; U. Minnesota; Dartmouth; Google; AT\&T; CWRU; Poly}, url = {../slides/cmtalk.pdf}, abstract = {Many fundamental data mining problems gain additional complexity when applied to massive amounts of data that is being generated at high speed. Networks are a driving force in massive data analysis, and require new approaches to produce the analyses network managers require. This motivates the data stream approach: using a small amount of space to process data very quickly in one pass. This has resulted in many strong and deep algorithmic results in this model of computation. But there is frequently a gap between theory and practice, since existing algorithms are too slow for typical high data rates by several orders of magnitude. I will describe my recent work aimed at bridging the gap. Firstly, I will describe the Count-Min sketch, which is a simple and hence fast data structure that can be used to approximate entries of a high dimensional vector in very small space. It can be used as a ``black box'' to solve several problems, including finding frequent items, and quantiles of the data distribution, and gives provable guarantees about their results. Secondly, I will describe and analyze a solution to the change detection problem, where the aim is to identify items (eg IP addresses) whose behavior has changed between two periods. Both these methods have been implemented and are running on network data at network line speeds with high accuracy. } }

@misc{TalkWDSA07, title = {Computational Fundamentals of Analyzing and Mining Data Streams}, key = {2007WDSA}, year = {2007}, note = {Tutorial at Workshop on Data Stream Analysis, Caserta, Italy}, month = mar, url = {../papers/cormodewdsa.pdf}, slides = {../slides/streammining.pdf}, abstract = { Many scenarios, such as network analysis, utility monitoring, and financial applications, generate massive streams of data. These streams consist of millions or billions of simple updates every hour, and must be processed to extract the information described in tiny pieces. This survey will introduce the problems of data stream monitoring, and some of the techniques that have been developed over recent years to help find the nuggets of gold while avoiding drowning in these massive streams of information. In particular, this talk will introduce the fundamental techniques used to create compact summaries of data streams: sampling, sketching, and other synopsis techniques. It will describe how to extract features from the stream such as frequent items, medians, and association rules. Lastly, we will see methods to detect when and how the process generating the stream is evolving, indicating some important change has occurred. } }

@misc{TalkGraphstream09, title = {Processing Graph Streams: Upper and Lower Bounds}, key = {2009graph}, year = 2009, month = jun, note = {Talk at Workshop on Algorithms and Models for Complex Networks, Bristol UK}, slides = {../slides/graphsbristol.pdf}, abstract = { In the graph streaming model of computation, we see the input graph presented as a sequence of edges, in some arbitrary order, and must perform computations with one or few passes over this input. Crucial parameters are the amount of working memory needed, and the time per update. In this talk, I'll discuss some of the key results in this youthful field. These include: \begin{itemize} \item lower bounds on the memory needed for fundamental computations (testing connectedness and bipartiteness) \item very small memory algorithms for approximating properties of the degree sequence and counting small subgraphs \item constructing spanners and matchings with memory proportional to the number of nodes but sublinear in the number of edges \item extensions to cases where each edge may be duplicated multiple times \end{itemize}} }

@misc{TalkFreq09, title = {Finding Frequent Items in Data Streams}, key = {2009Freq}, year = 2009, note = {Talk at DIMACS Working group on Streaming, Coding and Compressive Sensing; AT\&T Labs; UMass Amherst; Dartmouth College}, month = mar, slides = {../slides/freqslides.pdf}, abstract = { The frequent items problem is to process a stream of items and find all items occurring more than a given fraction of the time. It is one of the most heavily studied problems in data stream mining, dating back to the 1980s. Many applications rely directly or indirectly on finding the frequent items, and implementations are in use in large scale industrial systems. In this talk, I survey the problem, and present the most important algorithm in a common framework. I'll also show some experimental comparisons, and discuss some more recent results which explain why some of these algorithms give even better results than their initial analysis led us to believe. } }

@misc{TalkWeb207, title = {Analyzing Web 2.0, Blogs and Social Networks}, key = {2007Web}, year = 2007, month = dec, note = {Talk at AT\&T Labs} }

@misc{TalkBayesPriv14, key = {2014Bayes}, year = 2014, note = {Talk at Hamilton Institute; Edinburgh University; Yahoo! Research, New York}, month = mar, title = {Differentially Private Mechanisms for Data Release}, abstract = { Many users are clamouring for privacy techniques so that they can publish, share or sell their data safely. There are many aspects of data that then have to be managed: the data is often sparse, multi-dimensional, and structured. Each of these requires mechanisms that are aware of these features, so that they provide output which is useful and efficient to compute, in addition to providing a privacy guarantee. The state-of-the-art goal for this problem is differential privacy, which offers a strong degree of privacy protection without making restrictive assumptions about the adversary. Existing techniques using differential privacy, however, cannot effectively handle the publication of high-dimensional data. In particular, when the input dataset contains a large number of attributes, existing methods require injecting a prohibitive amount of noise compared to the signal in the data, which renders the published data next to useless. This talk outlines some recent efforts to generate practical mechanisms for high-dimensional data release. The key insight is the need to build a compact model for the data which accurately represents its structure, while remaining resilient to the noise introduced to preserve privacy. Models based on Bayesian networks turn out to be particularly effective for this task. However, private construction of Bayesian networks turns out to be significantly challenging. This leads to PrivBayes, a novel approach to identify correlations among attributes and release suitable marginals while ensuring differential privacy. Experimental evaluations of PrivBayes on real data, demonstrate that it significantly outperforms existing solutions in terms of accuracy.} }

*This file was generated by
bibtex2html 1.92.*