@comment{{This file has been generated by bib2bib 1.92}}

@comment{{Command line: C:\alldata\data\webbib\bib2bib-1.92.exe -oc other -ob other.bib -c '($type = "TECHREPORT" | $type = "UNPUBLISHED" | $type="INCOLLECTION")' cormode.bib}}

@techreport{CormodeJowhariMonemizadehMuthukrishnan16, author = {Graham Cormode and Hossein Jowhari and Morteza Monemizadeh and S. Muthukrishnan}, title = {The Sparse Awakens: Streaming Algorithms for Matching Size Estimation in Sparse Graphs}, year = 2016, key = {sparse16}, institution = {arXiv}, number = {arXiv:1608.03118}, link = {http://arxiv.org/abs/1608.03118}, abstract = { Estimating the size of the maximum matching is a canonical problem in graph algorithms, and one that has attracted extensive study over a range of different computational models. We present improved streaming algorithms for approximating the size of maximum matching with sparse (bounded arboricity) graphs. * {\em (Insert-Only Streams)} We present a one-pass algorithm that takes $O(c\log^2 n)$ space and approximates the size of the maximum matching in graphs with arboricity $c$ within a factor of $O(c)$. This improves significantly upon the state-of-the-art ${O}\sim(cn^{2/3})$-space streaming algorithms. * {\em (Dynamic Streams)} Given a dynamic graph stream (i.e., inserts and deletes) of edges of an underlying $c$-bounded arboricity graph, we present a one-pass algorithm that uses space $O(c^{10/3}n^{2/3})$ and returns an $O(c)$-estimator for the size of the maximum matching. This algorithm improves the state-of-the-art ${O}\sim(cn^{4/5})$-space algorithms, where the ${O}\sim(.)$ notation hides logarithmic in $n$ dependencies. * {\em (Random Order Streams)} For randomly ordered graph streams, we present a two-pass streaming algorithm that outputs a $(1+\epsilon)$-approximation of the maximal matching size using $O(d^{d^{d+6}})$ space for graphs with maximum degree bounded by $d$. The previous best algorithm for this setting is an $O(\log d)$ approximation. Our algorithm is the first with an approximation guarantee independent of $d$. Our result is obtained by simulating a property-testing result in a streaming setting, a technique which might be of independent interest. 
In contrast to the previous works, our results take more advantage of the streaming access to the input and characterize the matching size based on the ordering of the edges in the stream in addition to the degree distributions and structural properties of the sparse graphs. } }

@incollection{CormodeCM15, author = {Graham Cormode}, title = {Encyclopedia Entry on {'Count-Min Sketch'}}, editor = {Ming-Yang Kao}, year = 2015, url = {../papers/encalgs-cm.pdf}, pubsite = {http://link.springer.com/referenceworkentry/10.1007%2F978-3-642-27848-8_579-1}, booktitle = {Encyclopedia of Algorithms}, publisher = {Springer} }

@incollection{CormodeAMS15, author = {Graham Cormode}, title = {Encyclopedia Entry on {'AMS Sketch'}}, editor = {Ming-Yang Kao}, year = 2015, url = {../papers/encalgs-ams.pdf}, pubsite = {http://link.springer.com/referenceworkentry/10.1007%2F978-3-642-27848-8_578-1}, booktitle = {Encyclopedia of Algorithms}, publisher = {Springer} }

@incollection{CormodeMG15, author = {Graham Cormode}, title = {Encyclopedia Entry on {'Misra-Gries Summary'}}, editor = {Ming-Yang Kao}, year = 2015, url = {../papers/encalgs-mg.pdf}, pubsite = {http://link.springer.com/referenceworkentry/10.1007%2F978-3-642-27848-8_572-1}, booktitle = {Encyclopedia of Algorithms}, publisher = {Springer} }

@incollection{Cormode11sketch, title = {Sketch Techniques for Massive Data}, author = {Graham Cormode}, booktitle = {Synopses for Massive Data: Samples, Histograms, Wavelets and Sketches}, att_authors = {gc2602}, att_private = {false}, editor = {Graham Cormode and Minos Garofalakis and Peter Haas and Chris Jermaine}, series = {Foundations and Trends in Databases}, publisher = {NOW publishers}, year = 2011, url = {../papers/sk.pdf}, abstract = {Sketch techniques have undergone extensive development within the past few years. They are especially appropriate for the data streaming scenario, in which large quantities of data flow by and the sketch summary must continually be updated quickly and compactly. Sketches, as presented here, are designed so that the update caused by each new piece of data is largely independent of the current state of the summary. This design choice makes them faster to process, and also easy to parallelize. ``Frequency based sketches'' are concerned with summarizing the observed frequency distribution of a dataset. From these sketches, accurate estimations of individual frequencies can be extracted. This leads to algorithms to find the approximate heavy hitters (items which account for a large fraction of the frequency mass) and quantiles (the median and its generalizations). The same sketches are also used to estimate (equi)join sizes between relations, self-join sizes and range queries. These can be used as primitives within more complex mining operations, and to extract wavelet and histogram representations of streaming data. A different style of sketch construction leads to sketches for distinct-value queries. As mentioned above, using a sample to estimate the answer to a COUNT DISTINCT query does not give accurate results. In contrast, sketching methods which can make a pass over the whole data can provide guaranteed accuracy. 
Once built, these sketches estimate not only the cardinality of a given attribute or combination of attributes, but also the cardinality of various operations performed on them, such as set operations (union and difference), and selections based on arbitrary predicates.} }

@incollection{Cormode11CD, author = {Graham Cormode}, title = {Continuous Distributed Monitoring: A Short Survey}, att_authors = {gc2602}, att_private = {false}, publisher = {ACM}, booktitle = {Paper accompanying invited talk at Algorithms and Models for Distributed Event Processing (AlMoDEP)}, month = sep, year = 2011, url = {../papers/cdsurvey.pdf}, slides = {../slides/cdsurvey.pdf}, abstract = {In the model of continuous distributed monitoring, a number of observers each see a stream of observations. Their goal is to work together to compute a function of the union of their observations. This can be as simple as counting the total number of observations, or more complex non-linear functions such as tracking the entropy of the induced distribution. Assuming that it is too costly to simply centralize all the observations, it becomes quite challenging to design solutions which provide a good approximation to the current answer, while bounding the communication cost of the observers, and their other resources such as their space usage. This survey introduces this model, and describes a selection of results in this setting, from the simple counting problem to a variety of other functions that have been studied.} }

@incollection{BhagatCormodeMuthukrishnan11,
  author    = {S. Bhagat and Graham Cormode and S. Muthukrishnan},
  title     = {Node Classification in Social Networks},
  booktitle = {Social Network Data Analytics},
  editor    = {Charu C. Aggarwal},
  publisher = {Springer},
  year      = 2011,
  url       = {../papers/graphlabelchapter.pdf},
  link      = {http://arxiv.org/abs/1101.3291},
  abstract  = { When dealing with large graphs, such as those that arise in the context of online social networks, a subset of nodes may be labeled. These labels can indicate demographic values, interest, beliefs or other characteristics of the nodes (users). A core problem is to use this information to extend the labeling so that all nodes are assigned a label (or labels). In this chapter, we survey classification techniques that have been proposed for this problem. We consider two broad categories: methods based on iterative application of traditional classifiers using graph information as features, and methods which propagate the existing labels via random walks. We adopt a common perspective on these methods to highlight the similarities between different approaches within and across the two categories. We also describe some extensions and related directions to the central problem of node classification. }
}

@techreport{CormodeThalerYi10,
  author      = {Graham Cormode and Justin Thaler and Ke Yi},
  title       = {Verifying Computations with Streaming Interactive Proofs},
  institution = {Electronic Colloquium on Computational Complexity (ECCC)},
  number      = {TR10-159},
  volume      = {17},
  year        = {2010},
  pubsite     = {http://eccc.hpi-web.de/report/2010/159}
}

@incollection{Cormode09b, title = {Encyclopedia Entry on {'Count-Min Sketch'}}, author = {Graham Cormode}, att_authors = {gc2602}, att_private = {false}, pages = {511--516}, year = {2009}, editor = {Ling Liu and M. Tamer Ozsu}, url = {../papers/cmencyc.pdf}, pubsite = {http://www.springerlink.com/content/g8m2427656n4npp8/}, booktitle = {Encyclopedia of Database Systems}, publisher = {Springer}, abstract = { The Count-Min (CM) Sketch is a compact summary data structure capable of representing a high-dimensional vector and answering queries on this vector, in particular point queries and dot product queries, with strong accuracy guarantees. Such queries are at the core of many computations, so the structure can be used in order to answer a variety of other queries, such as frequent items (heavy hitters), quantile finding, join size estimation, and more. Since the data structure can easily process updates in the form of additions or subtractions to dimensions of the vector (which may correspond to insertions or deletions, or other transactions), it is capable of working over streams of updates, at high rates. The data structure maintains the linear projection of the vector with a number of other random vectors. These vectors are defined implicitly by simple hash functions. Increasing the range of the hash functions increases the accuracy of the summary, and increasing the number of hash functions decreases the probability of a bad estimate. These tradeoffs are quantified precisely below. Because of this linearity, CM sketches can be scaled, added and subtracted, to produce summaries of the corresponding scaled and combined vectors. } }

@techreport{Cormode10, author = {Graham Cormode}, title = {Individual Privacy vs Population Privacy: Learning to Attack Anonymization}, institution = {arXiv}, pubsite = {http://arxiv.org/abs/1011.2511}, number = {arXiv:1011.2511}, year = 2010, abstract = { Over the last decade there have been great strides made in developing techniques to compute functions privately. In particular, Differential Privacy gives strong promises about conclusions that can be drawn about an individual. In contrast, various syntactic methods for providing privacy (criteria such as k-anonymity and l-diversity) have been criticized for still allowing private information of an individual to be inferred. In this report, we consider the ability of an attacker to use data meeting privacy definitions to build an accurate classifier. We demonstrate that even under Differential Privacy, such classifiers can be used to accurately infer ``private'' attributes in realistic data. We compare this to similar approaches for inference-based attacks on other forms of anonymized data. We place these attacks on the same scale, and observe that the accuracy of inference of private attributes for Differentially Private data and l-diverse data can be quite similar.} }

@techreport{CormodeGarofalakis06,
  author      = {Graham Cormode and Minos Garofalakis},
  title       = {Histograms and Wavelets on Probabilistic Data},
  institution = {arXiv},
  number      = {arXiv:0806.1071},
  year        = 2008,
  pubsite     = {http://arxiv.org/abs/0806.1071},
  abstract    = { There is a growing realization that uncertain information is a first-class citizen in modern database management. As such, we need techniques to correctly and efficiently process uncertain data in database systems. In particular, data reduction techniques that can produce concise, accurate synopses of large probabilistic relations are crucial. Similar to their deterministic relation counterparts, such compact probabilistic data synopses can form the foundation for human understanding and interactive data exploration, probabilistic query planning and optimization, and fast approximate query processing in probabilistic database systems. In this paper, we introduce definitions and algorithms for building histogram- and wavelet-based synopses on probabilistic data. The core problem is to choose a set of histogram bucket boundaries or wavelet coefficients to optimize the accuracy of the approximate representation of a collection of probabilistic tuples under a given error metric. For a variety of different error metrics, we devise efficient algorithms that construct optimal or near optimal B-term histogram and wavelet synopses. This requires careful analysis of the structure of the probability distributions, and novel extensions of known dynamic-programming-based techniques for the deterministic domain. Our experiments show that this approach clearly outperforms simple ideas, such as building summaries for samples drawn from the data distribution, while taking equal or less time.}
}

@techreport{CormodeKornTirthapura07, author = {G. Cormode and F. Korn and S. Tirthapura}, title = {Time Decaying Aggregates in Out-of-order Streams}, year = 2007, number = {2007-10}, institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})}, link = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2007/2007-10.html}, abstract = { Processing large data streams is now a major topic in data management. The data involved can be truly massive, and the required analyses complex. In a stream of sequential events such as stock feeds, sensor readings, or IP traffic measurements, tuples pertaining to recent events are typically more important than older ones. This can be formalized via time decay functions, which assign weights based on age. Decay functions such as sliding windows and exponential decay have been well studied under the assumption of well-ordered arrivals, i.e., data arrives in the order of increasing time stamps. However, data quality issues are prevalent in massive streams (due to network asynchrony and delays or possibly due to features inherent to the measurement process), and correct order of arrival cannot be guaranteed. We focus on the computation of decayed aggregates such as range queries, quantiles, and heavy hitters on out-of-order streams, where elements do not necessarily arrive in increasing order of timestamps. We give the first deterministic algorithms for approximating these aggregates under the sliding window, exponential and polynomial decay functions. Our techniques are based on extending existing data stream summaries, such as the q-digest. We show that the overhead for allowing out-of-order arrivals is small when compared to the case of well-ordered arrivals. Our experimental study confirms that these methods can be applied in practice and investigates how the specific decay function impacts performance. } }

@techreport{CormodeMuthukrishnan05CS2, author = {G. Cormode and S. Muthukrishnan}, title = {Combinatorial Algorithms for Compressed Sensing}, att_authors = {gc2602}, att_private = {false}, institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})}, year = 2005, number = {2005-40}, url = {ftp://dimacs.rutgers.edu/pub/dimacs/TechnicalReports/TechReports/2005/2005-40.pdf}, link = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2005/2005-40.html}, abstract = { In sparse approximation theory, the fundamental problem is to reconstruct a signal A \in R^n from linear measurements with respect to a dictionary of psi_i's. Recently, there is focus on the novel direction of Compressed Sensing where the reconstruction can be done with very few---O(k log n)---linear measurements over a modified dictionary if the signal is compressible, that is, its information is concentrated in k coefficients. In particular, these results prove that there exists a single O(k log n) x n measurement matrix such that any such signal can be reconstructed from these measurements, with error at most O(1) times the worst case error for the class of such signals. Compressed sensing has generated tremendous excitement both because of the sophisticated underlying Mathematics and because of its potential applications. In this paper, we address outstanding open problems in Compressed Sensing. Our main result is an explicit construction of a non-adaptive measurement matrix and the corresponding reconstruction algorithm so that with number of measurements polynomial in k, log n, 1/$\epsilon$, we can reconstruct any compressible signal up to $1+\epsilon$ error. This is the first known polynomial time explicit construction of any such measurement matrix. Our result not only improves the error guarantee from O(1) to 1 + $\epsilon$ but also improves the reconstruction time from poly(n) to poly(k log n). 
Our second result is a randomized construction of a measurement matrix of size O(k polylog n) that works for each signal with high probability and gives per-instance approximation guarantee rather than over the class of all signals. Previous work on Compressed Sensing does not provide such per-instance approximation guarantee; our result improves the best known number of measurements known from prior work in other areas including Learning Theory, Streaming algorithms and Complexity Theory for this case. Our approach is combinatorial. In particular, we use two parallel sets of group tests, one to filter and the other to certify and estimate; the resulting algorithms are quite simple to implement. } }

@techreport{CormodeMuthukrishnan05CS,
  author      = {G. Cormode and S. Muthukrishnan},
  title       = {Towards an Algorithmic Theory of Compressed Sensing},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  number      = {2005-25},
  year        = 2005,
  url         = {ftp://dimacs.rutgers.edu/pub/dimacs/TechnicalReports/TechReports/2005/2005-25.pdf},
  link        = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2005/2005-25.html},
  abstract    = { In Approximation Theory, the fundamental problem is to reconstruct a signal A in R^n from linear measurements with respect to a dictionary Psi for R^n. Recently, there has been tremendous excitement about the novel direction of Compressed Sensing [Donoho 04] where the reconstruction can be done with very few---O(k log n)---linear measurements over a modified dictionary Psi' if the information of the signal is concentrated in k coefficients over an orthonormal basis Psi. These results have reconstruction error on any given signal that is optimal with respect to a broad class of signals. In a series of papers and meetings over the past year, a theory of Compressed Sensing has been developed by mathematicians. We develop an algorithmic perspective for the Compressed Sensing problem, showing that Compressed Sensing results resonate with prior work in Group Testing, Learning theory and Streaming algorithms. Our main contributions are new algorithms that present the most general results for Compressed Sensing with (1+$\epsilon$) approximation on every signal, faster algorithms for the reconstruction, as well as succinct transformations of Psi to Psi'. }
}

@incollection{CormodeMuthukrishnan06CISS, author = {G. Cormode and S. Muthukrishnan}, title = {Combinatorial Algorithms for Compressed Sensing}, booktitle = {Proceedings of Conference on Information Sciences and Systems ({CISS})}, year = {2006}, note = {Invited submission}, slides = {../slides/cs-cissslides.pdf}, url = {../papers/cs-ciss.pdf}, link = {http://www288.pair.com/ciss/ciss/numbered/321.pdf}, abstract = { In sparse approximation theory, the fundamental problem is to reconstruct a signal $A \in R^n$ from linear measurements $< A,{\psi_i}>$ with respect to a dictionary of $\psi_i$'s. Recently, there is focus on the novel direction of {\em Compressed Sensing} where the reconstruction can be done with very few---$O(k \log n)$---linear measurements over a modified dictionary if the signal is {\em compressible}, that is, its information is concentrated in $k$ coefficients with the original dictionary. In particular, these results prove that there exists a single $O(k\log n) \times n$ measurement matrix such that any such signal can be reconstructed from these measurements, with error at most $O(1)$ times the worst case error for the class of such signals. Compressed sensing has generated tremendous excitement both because of the sophisticated underlying Mathematics and because of its potential applications. In this paper, we address outstanding open problems in Compressed Sensing. Our main result is an explicit construction of a non-adaptive measurement matrix and the corresponding reconstruction algorithm so that with a number of measurements polynomial in $k, \log n, 1/\varepsilon$, we can reconstruct compressible signals. This is the first known polynomial time explicit construction of any such measurement matrix. In addition, our result improves the error guarantee from $O(1)$ to $1 + \varepsilon$ and improves the reconstruction time from $poly(n)$ to $poly(k\log n)$. 
Our second result is a randomized construction of $O(k polylog(n))$ measurements that work for each signal with high probability and gives per-instance approximation guarantees rather than over the class of all signals. Previous work on Compressed Sensing does not provide such per-instance approximation guarantees; our result improves the best known number of measurements known from prior work in other areas including Learning Theory, Streaming algorithms and Complexity Theory for this case. Our approach is combinatorial. In particular, we use two parallel sets of group tests, one to filter and the other to certify and estimate; the resulting algorithms are quite simple to implement.} }

@incollection{CormodeGarofalakis05DEB, author = {G. Cormode and M. Garofalakis}, title = {Efficient Strategies for Continuous Distributed Tracking Tasks}, att_authors = {gc2602}, att_private = {false}, publisher = {IEEE}, booktitle = {IEEE Data Engineering Bulletin}, pages = {33--39}, month = mar, year = {2005}, url = {../papers/cdtt.pdf}, abstract = { While traditional databases have focused on single query evaluation in a centralized setting, emerging applications require {\em continuous} tracking of queries on data that is widely {\em distributed} and constantly updated. We describe such scenarios, and describe the challenges involved in designing communication-efficient protocols for the tracking tasks we define. We outline some solutions to these problems, by abstracting a model of the communication system, defining the tracking tasks of interest, and building query-tracking schemes based on three guiding principles of minimizing global information, using summaries to capture whole data streams, and seeking stability of the protocols.} }

@techreport{CormodeMuthukrishnanRozenbaum05, author = {G. Cormode and S. Muthukrishnan and I. Rozenbaum}, title = {Summarizing and Mining Inverse Distributions on Data Streams via Dynamic Inverse Sampling}, number = {2005-11}, institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})}, url = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2005/2005-11.html}, year = {2005}, abstract = { Database management systems face the challenge of dealing with massive data distributions which arrive at high speeds while there is only small storage available for managing and mining them. Emerging data stream management systems approach this problem by summarizing and mining the distributions using samples or sketches. However, data distributions can be ``viewed'' in different ways. For example, a data stream of integer values can be viewed either as the {\em forward} distribution $f(x)$, ie., the number of occurrences of $x$ in the stream, or as its inverse, $f^{-1}(i)$, which is the number of items that appear $i$ times. While both such ``views'' are equivalent in stored data systems, over data streams that entail approximations, they may be significantly different. In other words, samples and sketches developed for the forward distribution may be ineffective for summarizing or mining the inverse distribution. Yet, many applications such as IP traffic monitoring naturally rely on mining inverse distributions. We formalize the problems of managing and mining inverse distributions and show provable differences between summarizing the forward distribution vs the inverse distribution. 
We present methods for summarizing and mining inverse distributions of data streams: they rely on a novel technique to maintain a dynamic sample over the stream with provable guarantees which can be used for a variety of summarization tasks (building quantiles or equidepth histograms) and mining (anomaly detection: finding heavy hitters, and measuring the number of rare items), all with provable guarantees on quality of approximations and time/space used by our streaming methods. We also complement our analytical and algorithmic results by presenting an experimental study of the methods over network data streams.} }

@incollection{Cormode05,
  author      = {G. Cormode},
  title       = {Some Key Concepts in Data Mining -- Clustering},
  booktitle   = {Discrete Methods in Epidemiology},
  series      = {DIMACS},
  volume      = {70},
  pages       = {2--9},
  publisher   = {AMS},
  year        = {2006},
  att_authors = {gc2602},
  att_private = {false},
  url         = {../papers/epidcluster.pdf},
  link        = {http://www.ams.org/bookstore-getitem/item=dimacs-70}
}

@incollection{CormodeGarofalakis05, editor = {M. Garofalakis and J. Gehrke and R. Rastogi}, booktitle = {Data Stream Management: Processing High-Speed Data Streams}, publisher = {Springer}, year = {2016}, author = {G. Cormode and M. Garofalakis}, title = {Join Sizes, Frequency Moments, and Applications}, url = {../papers/ams.pdf}, link = {http://www.springer.com/gb/book/9783540286073}, abstract = {In this chapter, we focus on a set of problems chiefly inspired by the problem of estimating the join size between two database relations. This problem is at the heart of a wide variety of other problems, both in databases and beyond. Given two relations, $R$ and $S$, and an attribute $a$ common to both relations, the {\em equi-join} between the two relations, $R$ and $S$, consists of one tuple for each $r \in R$ and $s \in S$ pair such that $r.a = s.a$. Estimating the join size between pairs of relations is a key component in planning how to answer a complex SQL query that may contain arbitrary combinations of selection, projection and join tasks. The result can also be used to answer some queries approximately. This applies directly in a streaming context, and additionally within a traditional database management system that is operating on the stream of updates generated by tuple insertions and deletions. Knowing the join size is also an important problem at the heart of a variety of other problems, such as building histogram and wavelet representations of data, and can be applied to problems such as finding frequent items and quantiles, and modelling spatial and high-dimensional data.} }

@incollection{CormodeIndyk05, editor = {M. Garofalakis and J. Gehrke and R. Rastogi}, booktitle = {Data Stream Management: Processing High-Speed Data Streams}, year = {2016}, publisher = {Springer}, url = {../papers/stablechapter.pdf}, author = {G. Cormode and P. Indyk}, title = {Stable Distributions in Streaming Computations}, abstract = { In many streaming scenarios, we need to measure and quantify the data that is seen. For example, we may want to measure the number of distinct IP addresses seen over the course of a day, compute the difference between incoming and outgoing transactions in a database system or measure the overall activity in a sensor network. More generally, we may want to cluster readings taken over periods of time or in different places to find patterns, or find the most similar signal from those previously observed to a new observation. For these measurements and comparisons to be meaningful, they must be well-defined. Here, we will use the well-known and widely used $L_p$ norms. These encompass the familiar Euclidean (root of sum of squares) and Manhattan (sum of absolute values) norms. In this chapter, we will show how $L_p$ distances can be estimated for massive vectors presented in the streaming model. This is achieved by making succinct {\em sketches} of the data, which can be used as synopses of the vectors they summarize.} }

@techreport{Cormode04LemsTR,
  author      = {G. Cormode},
  title       = {The Hardness of the Lemmings Game, or, ``{Oh} no, more {NP-C}ompleteness Proofs''},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  number      = {2004-11},
  year        = {2004},
  link        = {Cormode04.html},
  pubsite     = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2004/2004-11.html},
  abstract    = { In the computer game `Lemmings', the player must guide a tribe of green-haired lemming creatures to safety, and save them from an untimely demise. We formulate the decision problem which is, given a level of the game, to decide whether it is possible to complete the level (and hence to find a solution to that level). Under certain limitations, this can be decided in polynomial time, but in general the problem is shown to be NP-Hard. This can hold even if there is only a single lemming to save, thus showing that it is hard to approximate the number of saved lemmings to any factor.}
}

@techreport{AbelloCormode04,
  author      = {J. Abello and G. Cormode},
  title       = {Report on {DIMACS} Working Group Meeting: Data Mining and Epidemiology, {M}arch 18-19, 2004},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  number      = {2004-37},
  year        = {2004},
  url         = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2004/2004-37.html},
  abstract    = { Epidemiology is an observational science that concerns itself with finding and explaining patterns of health and disease in populations, usually of humans, but also populations of animals, insects and plants. Data mining is an active area of research interested in finding algorithms for describing latent patterns in often very large data sets. This Working Group had the objective of fostering collaboration between these two disciplines. In March of 2004 it organized a two-day meeting at DIMACS to bring these two groups together in a format designed to initiate such collaborations.}
}

@techreport{CormodeCzumajMuthukrishnan04TR, author = {G. Cormode and A. Czumaj and S. Muthukrishnan}, title = {How to {\em increase} the acceptance ratios of top conferences}, url = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2004/2004-12.html}, number = {2004-12}, institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})}, year = {2004}, link = {CormodeCzumajMuthukrishnan04.html}, abstract = { In the beginning was the pub. This work was triggered by a pub conversation where the authors observed that many resumes list acceptance ratios of conferences where their papers appear, boasting the low acceptance ratio. The lower the ratio, the better your paper looks. The list might look equally impressive if one listed the rejection ratio of conferences where one's paper was submitted and rejected. We decided to lampoon rather than lament the effort the PC typically put in: wouldn't the world be better if we could encourage only high quality submissions and so run top conferences with very high acceptance ratios? This paper captures our thoughts, and it is best consumed in a pub (and in color).} }

@techreport{CormodeMuthukrishnan03RadialTR,
  author      = {G. Cormode and S. Muthukrishnan},
  title       = {Radial Histograms for Spatial Streams},
  year        = {2003},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  number      = {2003-11},
  url         = {ftp://dimacs.rutgers.edu/pub/dimacs/TechnicalReports/TechReports/2003/2003-11.ps.gz},
  pubsite     = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2003/2003-11.html},
  abstract    = { Many data streams relate to geographic or spatial information such as the tracking of moving objects, or location-specific measurements and queries. While several techniques have been developed recently for processing numerical, text or XML streams, new techniques are needed to process spatial queries on streaming geometric data. We propose a novel data structure, the Radial Histogram, to process streams including spatial data points. It allows a number of geometric aggregates involving the spread and extent of the points in the data streams---diameter, furthest neighbors, convex hulls---to be accurately estimated to arbitrary precision. By using multiple Radial Histograms for a set of given facilities, we can process data streams consisting of massive numbers of client points. We can then accurately estimate a number of spatial aggregates involving relative placement of clients with respect to the facilities, including reverse nearest neighbors, spatial joins and more. Radial histograms use very small space and are exceedingly simple to implement. Nevertheless, we prove that they guarantee accurate estimation for spatial aggregate queries mentioned above. An experimental evaluation shows them to perform very well in practice.}
}

@techreport{CormodeMuthukrishnan01EditTR,
  author      = {G. Cormode and S. Muthukrishnan},
  title       = {The String Edit Distance Matching Problem with Moves},
  url         = {ftp://dimacs.rutgers.edu/pub/dimacs/TechnicalReports/TechReports/2001/2001-26.ps.gz},
  pubsite     = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2001/2001-26.html},
  number      = {2001-26},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  link        = {CormodeMuthukrishnan02.html},
  year        = {2001},
  abstract    = { The edit distance between two strings S and R is defined to be the minimum number of character inserts, deletes and changes needed to convert R to S. Given a text string t of length n, and a pattern string p of length m, informally, the string edit distance matching problem is to compute the smallest edit distance between p and substrings of t. A well known dynamic programming algorithm takes time O(nm) to solve this problem, and it is an important open problem in Combinatorial Pattern Matching to significantly improve this bound. We relax the problem so that (a) we allow an additional operation, namely, substring moves, and (b) we approximate the string edit distance up to a factor of O(log n log* n). (log* n is the number of times log function is applied to n to produce a constant.) Our result is a neat linear time deterministic algorithm for this version of the problem. This is the first known significantly subquadratic algorithm for a string edit distance problem in which the distance involves nontrivial alignments. Our results are obtained by embedding strings into L_1 vector space using a simplified parsing technique we call Edit Sensitive Parsing (ESP). This embedding is approximately distance preserving, and we show many applications of this embedding to string proximity problems including nearest neighbors, outliers, and streaming computations with strings. }
}

@techreport{CormodeMuthukrishnan02DomTR,
  author      = {G. Cormode and S. Muthukrishnan},
  title       = {Estimating Dominance Norms of Multiple Data Streams},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  link        = {CormodeMuthukrishnan03DN.html},
  year        = {2002},
  number      = {2002-35},
  url         = {ftp://dimacs.rutgers.edu/pub/dimacs/TechnicalReports/TechReports/2002/2002-35.ps.gz},
  pubsite     = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2002/2002-35.html},
  abstract    = { Data streams often consist of multiple signals. Consider a stream of multiple signals (i,a_i,j) where i's correspond to the domain, j's index the different signals and a_i,j >= 0 to the value of the j'th signal at point i. We study the problem of determining the dominance norms over the multiple signals, in particular the max-dominance norm, defined as sum_i max_j {a_i,j}. It is used in applications to estimate the ``worst case influence'' of multiple processes, for example in IP traffic analysis, electrical grid monitoring and financial domain. Besides finding many applications, it is a natural measure: it generalizes the notion of union of data streams and may be alternately thought of as estimating the L1 norm of the upper envelope of multiple signals. We present the first known data stream algorithm for estimating max-dominance of multiple signals. In particular, we use workspace and time-per-item that are both sublinear (in fact, poly-logarithmic) in the input size. The algorithm is simple and implementable; its analysis relies on using properties of stable random distributions with small parameter alpha, which may be a technique of independent interest. In contrast we show that other dominance norms -- min-dominance sum_i min_j {a_i,j}, count-dominance (|{i | a_i > b_i}|) or relative-dominance (sum_i a_i/max{1,b_i}), are all impossible to estimate accurately with sublinear space. }
}

@unpublished{Cormode99,
  title    = {Topic Dependencies for Electronic Books},
  author   = {G. Cormode},
  year     = {1999},
  note     = {(unpublished manuscript)},
  url      = {../papers/eb.pdf},
  abstract = { Electronic books consist of a number of topics, and information about dependencies between topics. We examine some theoretical problems related to handling these topic dependencies. In particular we consider the case where these topic dependencies are acyclic, and nondecomposable (that is, if topic a and b together require c, it is not necessarily the case that a or b alone require c). We formalise the semantics of this system, and give an axiomatisation which we show is sound and complete. We then show that we can easily compute the closure of a set of all topics in this model, which is the set of topics which are required by that set. This closure is computed in an identical manner to that for other formalisations of topic dependencies, showing that for this purpose no distinction need be drawn. We then consider a different kind of closure, where given a set of desired topics, we must find a subset such that the total running time of all topics in its closure is less than a given time bound. We show that this problem is NP-complete. We analyse some proposed heuristics to approximate the solution of this problem and demonstrate that inputs can be engineered which elicit very poor performance from these heuristics, showing that no factor of approximation can be guaranteed for them. Finally, we transform the problem into a problem over a bipartite graph, which in turn can be formulated as a problem in mathematical integer programming. Hardness results for this problem give us confidence that a guaranteed approximation is unlikely to exist.}
}

@unpublished{Cormode98,
  author   = {G. Cormode},
  title    = {Springs and Sound Layouts},
  year     = {1998},
  pubsite  = {../../images/web.gif},
  url      = {../papers/sasl.pdf},
  note     = {(unpublished manuscript)},
  abstract = {An empirical investigation of graph drawing algorithms performed as a final year project.}
}

@incollection{CormodeWDSA07,
  title     = {Computational Fundamentals of Analyzing and Mining Data Streams},
  author    = {G. Cormode},
  year      = {2007},
  booktitle = {Workshop on Data Stream Analysis},
  url       = {../papers/cormodewdsa.pdf},
  link      = {TalkWDSA07.html},
  abstract  = { Many scenarios, such as network analysis, utility monitoring, and financial applications, generate massive streams of data. These streams consist of millions or billions of simple updates every hour, and must be processed to extract the information described in tiny pieces. This survey provides an introduction to the problems of data stream monitoring, and some of the techniques that have been developed over recent years to help mine the data while avoiding drowning in these massive flows of information. In particular, this tutorial introduces the fundamental techniques used to create compact summaries of data streams: sampling, sketching, and other synopsis techniques. It describes how to extract features such as clusters and association rules. Lastly, we see methods to detect when and how the process generating the stream is evolving, indicating some important change has occurred.}
}

@comment{This file was generated by bibtex2html 1.92.}