1 @INPROCEEDINGS{chatnoir,
\r
2 AUTHOR = {Martin Potthast and Matthias Hagen and Benno Stein and Jan Gra{\ss}egger and Maximilian Michel and Martin Tippmann and Clement Welsch},
\r
3 BOOKTITLE = {35th International ACM Conference on Research and Development in Information Retrieval (SIGIR 12)},
\r
5 EDITOR = {Bill Hersh and Jamie Callan and Yoelle Maarek and Mark Sanderson},
\r
10 SITE = {Portland, Oregon},
\r
11 TITLE = {{ChatNoir: A Search Engine for the ClueWeb09 Corpus}},
\r
15 @BOOK{text_patterns,
\r
16 author = "Mike Scott and Christopher Tribble",
\r
17 title = "{Textual Patterns, Key words and corpus analysis in language education}",
\r
19 publisher = "{John Benjamins Publishing Company}",
\r
25 title = "{Sketch Engine EnTenTen corpus}",
\r
26 howpublished = "\url{http://trac.sketchengine.co.uk/wiki/Corpora/enTenTen}",
\r
30 @book{ManningRaghavanSchuetze08,
\r
31 abstract = {Class-tested and coherent, this textbook teaches classical and web information retrieval, including web search and the related areas of text classification and text clustering from basic concepts. It gives an up-to-date treatment of all aspects of the design and implementation of systems for gathering, indexing, and searching documents; methods for evaluating systems; and an introduction to the use of machine learning methods on text collections. All the important ideas are explained using examples and figures, making it perfect for introductory courses in information retrieval for advanced undergraduates and graduate students in computer science. Based on feedback from extensive classroom experience, the book has been carefully structured in order to make teaching more natural and effective.},
\r
32 added-at = {2012-05-30T10:50:27.000+0200},
\r
33 address = {Cambridge, UK},
\r
34 author = {Manning, Christopher D. and Raghavan, Prabhakar and Sch{\"u}tze, Hinrich},
\r
35 biburl = {http://www.bibsonomy.org/bibtex/28516d94c1f7aa1e391ddd3ace4caa23b/flint63},
\r
36 file = {Cambridge University Press Product Page:http\://www.cambridge.org/9780521865715:URL;Amazon Search inside:http\://www.amazon.de/gp/reader/0521865719/:URL;Google Books:http\://books.google.de/books?isbn=978-0-521-86571-5:URL},
\r
39 interhash = {b6954037b1d444f4afe4cad883b4d80c},
\r
40 intrahash = {8516d94c1f7aa1e391ddd3ace4caa23b},
\r
41 isbn = {978-0-521-86571-5},
\r
42 keywords = {v1205 book ai information retrieval language processing search xml web},
\r
43 publisher = {Cambridge University Press},
\r
44 timestamp = {2012-05-30T10:50:27.000+0200},
\r
45 title = {Introduction to Information Retrieval},
\r
46 username = {flint63},
\r
51 @INPROCEEDINGS{Knight,
\r
52 author = {Allan Knight and Kevin Almeroth and Bruce Bimber},
\r
53 title = {An Automated System for Plagiarism Detection Using the Internet},
\r
54 booktitle = {Proceedings of World Conference on Educational Multimedia, Hypermedia and Telecommunications, pg. 3619-3625},
\r
59 @INPROCEEDINGS{Kasprzak2008,
\r
60 AUTHOR = "Jan Kasprzak and Michal Brandejs and Miroslav K{\v{r}}ipa{\v{c}} and Pavel {\v S}merk",
\r
61 TITLE = "Distributed System for Discovering Similar Documents",
\r
62 SUBTITLE = "From a Relational Database to the Custom-Developed Parallel solution",
\r
63 BOOKTITLE = "ICEIS 2008: Proceedings of the Tenth International Conference on Enterprise Information Systems, Vol. DISI---Databases and Information Systems Integration",
\r
65 publisher = "INSTICC (Institute for Systems and Technologies of Information, Control and Communication), Setúbal, Portugal",
\r
67 isbn = "978-989-8111-36-4",
\r
71 @INPROCEEDINGS{Kasprzak2009,
\r
72 AUTHOR = "Jan Kasprzak and Michal Brandejs and Jitka Brandejsov{\'a}",
\r
73 TITLE = "Distributed Aspects of the System for Discovering Similar Documents",
\r
74 BOOKTITLE = "ITA 09: Proceedings of the Third International Conference on Internet Technology and Applications",
\r
79 @INPROCEEDINGS{Kasprzak2009a,
\r
80 AUTHOR = "Jan Kasprzak and Michal Brandejs and Miroslav K{\v{r}}ipa{\v{c}}",
\r
81 TITLE = "Finding Plagiarism by Evaluating Document Similarities",
\r
82 BOOKTITLE = "SEPLN'09: The 25th edition of the Annual Conference of the Spanish Society for Natural Language Processing",
\r
87 @INPROCEEDINGS{Monostori2002,
\r
88 author = {Kriszti{\'a}n Monostori and Raphael A. Finkel and Arkady B. Zaslavsky and G{\'a}bor Hod{\'a}sz and M{\'a}t{\'e} Pataki},
\r
89 title = {Comparison of Overlap Detection Techniques},
\r
90 booktitle = {ICCS '02: Proceedings of the International Conference on Computational Science-Part I},
\r
92 isbn = {3-540-43591-3},
\r
94 publisher = {Springer-Verlag},
\r
95 address = {London, UK},
\r
100 title = "{Czech National Archive of Graduate Theses}",
\r
101 howpublished = "\url{http://theses.cz/}",
\r
102 year = "2008--2011",
\r
107 title = "{Masaryk University Information System}",
\r
108 howpublished = "\url{http://is.muni.cz/}",
\r
109 year = "1999--2011",
\r
113 key = "{Odevzdej.CZ}",
\r
114 title = "{Odevzdej---the system for collecting seminar works}",
\r
115 howpublished = "\url{http://odevzdej.cz/}",
\r
116 year = "2009--2011",
\r
120 @inproceedings{finkel2002,
\r
121 author = {Finkel, Raphael A. and Zaslavsky, Arkady and Monostori, Kriszti{\'a}n and Schmidt, Heinz},
\r
122 title = {Signature extraction for overlap detection in documents},
\r
123 booktitle = {ACSC '02: Proceedings of the twenty-fifth Australasian conference on Computer science},
\r
125 isbn = {0-909925-82-8},
\r
127 location = {Melbourne, Victoria, Australia},
\r
128 publisher = {Australian Computer Society, Inc.},
\r
129 address = {Darlinghurst, Australia},
\r
132 @INPROCEEDINGS{broder97,
\r
133 title={On the resemblance and containment of documents},
\r
134 author={Broder, A. Z.},
\r
135 booktitle={Compression and Complexity of Sequences 1997. Proceedings},
\r
141 keywords={information retrieval, random processes, set theory, Rabin fingerprints, World Wide Web, containment, documents, fixed size sample, informal notions, information retrieval, intersection problems, mathematical notions, mathematical properties, random sampling, resemblance, roughly contained, roughly the same},
\r
142 doi={10.1109/SEQUEN.1997.666900},
\r
147 author="Rivest, R.",
\r
148 title="{RFC1321: The MD5 Message-Digest Algorithm}",
\r
150 publisher = {RFC Editor},
\r
151 address = {United States},
\r
152 note={\url{http://www.rfc-editor.org/rfc/rfc1321.txt}},
\r
155 @Misc{britannicaplagiarism,
\r
156 author = "{Encyclop{\ae}dia Britannica}",
\r
157 title = "Plagiarism",
\r
158 howpublished = "retrieved 2009--08--24 from \url{http://www.britannica.com/EBchecked/topic/462640/plagiarism}",
\r
163 author = "iDnes.CZ",
\r
164 title = "Zlínského děkana usvědčili z plagiátorství",
\r
165 howpublished = "retrieved 2009--08--25 from \url{http://zpravy.idnes.cz/studium.asp?c=A080709_085836_studium_bar}",
\r
169 @inproceedings{pomikalek2008,
\r
170 author = "Pomikálek, Jan and Rychlý, Pavel",
\r
171 title = "Detecting Co-Derivative Documents in Large Text Collections",
\r
172 booktitle = "Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC'08)",
\r
175 address = "Marrakech, Morocco",
\r
176 url = "http://www.lrec-conf.org/lrec2008/"
\r
179 @inproceedings{pomikalek2009,
\r
180 author = "Pomikálek, Jan and Rychlý, Pavel and Kilgarriff, Adam",
\r
181 title = "Scaling to Billion-plus Word Corpora",
\r
182 booktitle = "Advances in Computational Linguistics",
\r
185 address = "Mexico",
\r
186 issn = "1870-4069",
\r
187 publisher = "Instituto Politécnico Nacional",
\r
192 title = "Masaryk University: Full-text search",
\r
193 howpublished = "retrieved 2009--08--25 from \url{http://www.muni.cz/general/search}",
\r
198 author = "Alan Cox",
\r
199 title = "{Alan Cox talks about laws... and Linux}",
\r
200 howpublished = "retrieved 2009--08--27 from \url{http://interviews.slashdot.org/article.pl?sid=02/05/20/1314214}",
\r
204 @inproceedings{coderivative,
\r
205 Author = {Bernstein, Y and Zobel, J},
\r
206 Title = {{A Scalable System for Identifying Co-derivative Documents}},
\r
207 Booktitle = {{String Processing and Information Retrieval, Proceedings}},
\r
208 Series = {{Lecture Notes in Computer Science}},
\r
212 Publisher = {{Springer-Verlag Berlin}},
\r
213 Type = {{Proceedings Paper}},
\r
214 Language = {{English}},
\r
215 Affiliation = {{Bernstein, Y (Reprint Author), RMIT Univ, Sch Comp Sci \& Informat Technol, Melbourne, Vic, Australia.
\r
216 RMIT Univ, Sch Comp Sci \& Informat Technol, Melbourne, Vic, Australia.}},
\r
217 ISSN = {{0302-9743}},
\r
218 ISBN = {{3-540-23210-9}},
\r
219 Keywords-Plus = {{COMPRESSION}},
\r
220 Subject-Category = {{Computer Science, Theory \& Methods}},
\r
221 Author-Email = {{ybernste@cs.rmit.edu.au
\r
222 jz@cs.rmit.edu.au}},
\r
223 Number-of-Cited-References = {{17}},
\r
224 Times-Cited = {{2}},
\r
225 Doc-Delivery-Number = {{BBA15}},
\r
226 Unique-ID = {{ISI:000224377200006}},
\r
230 key = "{Turnitin.com}",
\r
231 title = "Turnitin",
\r
232 howpublished = "\url{http://turnitin.com/}, retrieved 2009--08--26",
\r
237 key = "{Copyscape.com}",
\r
238 title = "Copyscape",
\r
239 howpublished = "\url{http://copyscape.com/}, retrieved 2009--08--26",
\r
244 key = "{Doccop.com}",
\r
246 howpublished = "\url{http://doccop.com/}, retrieved 2009--08--26",
\r
250 @MISC{pan09competition,
\r
252 title = "1st International Competition on Plagiarism Detection",
\r
253 howpublished = "\url{http://www.uni-weimar.de/medien/webis/research/workshopseries/pan-09/competition.html}, retrieved 2009--08--26",
\r
257 @INPROCEEDINGS{Brin95copydetection,
\r
258 author = {Sergey Brin and James Davis and Hector Garcia-Molina},
\r
259 title = {Copy Detection Mechanisms for Digital Documents},
\r
260 booktitle = {Proceedings of the ACM SIGMOD Annual Conference},
\r
265 @INPROCEEDINGS{Shivakumar95scam,
\r
266 author = {Narayanan Shivakumar and Hector Garcia-Molina},
\r
267 title = {SCAM: A Copy Detection Mechanism for Digital Documents},
\r
268 booktitle = {Proceedings of the Second Annual Conference on the Theory and Practice of Digital Libraries},
\r
272 @INPROCEEDINGS{Garcia-Molina96dscam:finding,
\r
273 author = {Hector Garcia-Molina and Luis Gravano and Narayanan Shivakumar},
\r
274 title = {dSCAM: Finding Document Copies across Multiple Databases},
\r
275 booktitle = {Proceedings of the 4th International Conference on Parallel and Distributed Information Systems},
\r
279 @INPROCEEDINGS{LinuxDesktop,
\r
280 author = {Jan Kasprzak},
\r
281 title = "{Desktop a jádro Linuxu}",
\r
282 booktitle = {Proceedings of the XXXI EurOpen.CZ Conference},
\r
283 isbn = "978-80-86583-13-6",
\r
285 publisher = "EurOpen.CZ, Plzeň",
\r
289 @INPROCEEDINGS{Filesystems,
\r
290 author = {Jan Kasprzak},
\r
291 title = "{Co umí souborové systémy}",
\r
292 booktitle = {Proceedings of the XXXII EurOpen.CZ Conference},
\r
293 isbn = "978-80-86583-14-3",
\r
294 pages = {105--118},
\r
295 publisher = "EurOpen.CZ, Plzeň",
\r
299 @INPROCEEDINGS{GitEuropen,
\r
300 author = {Jan Kasprzak},
\r
301 title = "{Git aneb správa verzí trochu jinak}",
\r
302 booktitle = {Proceedings of the XXXIV EurOpen.CZ Conference},
\r
303 isbn = "978-80-86583-16-7",
\r
304 pages = {107--118},
\r
305 publisher = "EurOpen.CZ, Plzeň",
\r
309 @INPROCEEDINGS{Clusters,
\r
310 author = {Jan Kasprzak},
\r
311 title = "{Clusterová řešení pod Linuxem}",
\r
312 booktitle = {SLT 2001: Proceedings of the 2nd Seminar on Linux and \TeX},
\r
313 isbn = "80-7302-009-2",
\r
314 pages = {161--168},
\r
315 publisher = "Konvoj, Brno",
\r
320 AUTHOR = {{Webis at Bauhaus-Universität Weimar} and
\r
321 {NLEL at Universidad Politécnica de Valencia}},
\r
322 HOWPUBLISHED = {\url{http://www.webis.de/research/corpora}},
\r
323 TITLE = {{PAN Plagiarism Corpus 2009 (PAN-PC-09)}},
\r
325 NOTE = {{Martin Potthast, Andreas Eiselt, Benno Stein,
\r
326 Alberto Barrón-Cedeño, and Paolo Rosso (editors)}}
\r
329 @INPROCEEDINGS{ngram,
\r
330 author = {William B. Cavnar and John M. Trenkle},
\r
331 title = {N-Gram-Based Text Categorization},
\r
332 booktitle = {Proceedings of SDAIR-94, 3rd Annual Symposium on Document Analysis and Information Retrieval},
\r
337 @inproceedings{intrinsic,
\r
338 author = {Sven Meyer zu Eissen and Benno Stein},
\r
339 booktitle = {ECIR},
\r
340 editor = {Mounia Lalmas and Andy MacFarlane and Stefan M. Rüger and Anastasios Tombros and Theodora Tsikrika and Alexei Yavlinsky},
\r
342 publisher = {Springer},
\r
343 series = {Lecture Notes in Computer Science},
\r
344 title = {Intrinsic Plagiarism Detection},
\r
345 url = {http://dblp.uni-trier.de/db/conf/ecir/ecir2006.html#EissenS06},
\r
348 ee = {http://dx.doi.org/10.1007/11735106_66},
\r
349 isbn = {3-540-33347-9},
\r
350 date = {2006-04-03}
\r
353 @INPROCEEDINGS{Heintze96scalabledocument,
\r
354 author = {Nevin Heintze},
\r
355 title = {Scalable Document Fingerprinting},
\r
356 booktitle = {Proc. USENIX Workshop on Electronic Commerce},
\r
360 @inproceedings{suffixtree,
\r
361 author = {Manber, Udi and Myers, Gene},
\r
362 title = {Suffix arrays: a new method for on-line string searches},
\r
363 booktitle = {SODA '90: Proceedings of the first annual ACM-SIAM symposium on Discrete algorithms},
\r
365 isbn = {0-89871-251-3},
\r
366 pages = {319--327},
\r
367 location = {San Francisco, California, United States},
\r
368 publisher = {Society for Industrial and Applied Mathematics},
\r
369 address = {Philadelphia, PA, USA}
\r
375 AUTHOR = "Andrew Tridgell and Paul Mackerras",
\r
376 TITLE = "The rsync algorithm",
\r
377 DATE = "2004--05--19",
\r
379 INSTITUTION = "Department of Computer Science, FEIT, Australian National University",
\r
381 NOTE = "\url{http://hdl.handle.net/1885/40765}",
\r
386 AUTHOR = "Linus Torvalds and others",
\r
387 TITLE = "{Git---the Fast Version Control System}",
\r
388 HOWPUBLISHED = "\url{http://git-scm.com/}, retrieved 2011--01--12",
\r
393 TITLE = "{The RPM Package Manager}",
\r
394 HOWPUBLISHED = "\url{http://www.rpm.org/}, retrieved 2011--01--12",
\r
399 TITLE = "{DeltaRPM}",
\r
400 HOWPUBLISHED = "\url{ftp://ftp.suse.com/pub/projects/deltarpm/}, retrieved 2011--01--12",
\r
403 @mastersthesis{zazrivec,
\r
404 AUTHOR = "Milan Zázrivec",
\r
405 TITLE = "Algoritmus a implementace software pro tvorbu binárních záplat",
\r
406 SCHOOL = "Faculty of Informatics, Masaryk University",
\r
407 NOTE = "\url{http://is.muni.cz/th/60716/fi_m/}",
\r
412 KEY = "Google Scholar",
\r
413 TITLE = "{Google Scholar}",
\r
414 HOWPUBLISHED = "\url{http://scholar.google.com/}, retrieved 2011--01--12",
\r
419 TITLE = "{American National Standard for Information Systems -- Coded Character Sets -- 7-Bit American National Standard Code for Information Interchange (7-Bit ASCII), ANSI X3.4-1986}",
\r
420 DATE="1986--03--26",
\r
422 INSTITUTION = "American National Standards Institute, Inc.",
\r
426 AUTHOR = "{The Unicode Consortium}",
\r
427 TITLE = "{The Unicode Standard -- Version 4.0}",
\r
428 PUBLISHER = "Addison-Wesley, Boston, MA",
\r
430 ISBN = "0-321-18578-1",
\r
431 NOTE = "\url{http://www.unicode.org/unicode/standard/versions/enumeratedversions.html#Unicode_4_0_0}",
\r
436 TITLE = "{RFC 3629: UTF-8, a transformation format of ISO 10646}",
\r
437 AUTHOR = "F. Yergeau",
\r
439 NOTE = "\url{http://tools.ietf.org/html/rfc3629}",
\r
442 @MISC{thesisproposal,
\r
443 TITLE = "{Systems for Discovering Similar Documents}",
\r
444 AUTHOR = "Jan Kasprzak",
\r
445 INSTITUTION = "Faculty of Informatics, Masaryk University",
\r
447 NOTE = "Ph.D. thesis proposal, \url{http://is.muni.cz/th/1885/fi_r/}",
\r
450 @inproceedings{Kasprzak2010,
\r
451 title={Improving the reliability of the plagiarism detection system},
\r
452 author={Kasprzak, J. and Brandejs, M.},
\r
453 booktitle={Notebook Papers of CLEF 2010 LABs and Workshops},
\r
455 organization={Citeseer}
\r
458 @article{stamatatos2011plagiarism,
\r
459 title={Plagiarism detection using stopword n-grams},
\r
460 author={Stamatatos, E.},
\r
461 journal={Journal of the American Society for Information Science and Technology},
\r
463 publisher={Wiley Online Library}
\r
465 @inproceedings{pan09stamatatos,
\r
466 author = {Efstathios Stamatatos},
\r
468 title = {Intrinsic Plagiarism Detection Using Character n-gram Profiles},
\r
469 booktitle = {Proceedings of the SEPLN'09 Workshop on Uncovering Plagiarism, Authorship and Social Software Misuse},
\r
471 location = {San Sebastian (Donostia), Spain},
\r
472 issn = {1613-0073},
\r
476 @book{zipf1935psycho,
\r
477 title={The psycho-biology of language},
\r
478 author={Zipf, G. K.},
\r
480 publisher={Houghton, Mifflin}
\r
483 @INPROCEEDINGS{potthastframework,
\r
484 TITLE = {{An Evaluation Framework for Plagiarism Detection}
\r
486 AUTHOR = {Martin Potthast and Benno Stein and Alberto Barr{\'o}n-Cede{\~n}o and Paolo Rosso},
\r
487 BOOKTITLE = {Proceedings of the 23rd International Conference on Computational Linguistics (COLING 2010)},
\r
490 ADDRESS = {Beijing, China},
\r
491 PUBLISHER = {Association for Computational Linguistics},
\r