{"$schema": "https://c3voc.de/schedule/schema.json", "generator": {"name": "pretalx", "version": "2026.1.1"}, "schedule": {"url": "https://pretalx.com/bbuzz22/schedule/", "version": "0.15", "base_url": "https://pretalx.com", "conference": {"acronym": "bbuzz22", "title": "Berlin Buzzwords 2022", "start": "2022-06-12", "end": "2022-06-14", "daysCount": 3, "timeslot_duration": "00:05", "time_zone_name": "Europe/Berlin", "colors": {"primary": "#3D3182"}, "rooms": [{"name": "Kesselhaus", "slug": "1406-kesselhaus", "guid": "b40dedd4-1259-531b-9007-e67b1c8db4c2", "description": null, "capacity": 250}, {"name": "Palais Atelier", "slug": "1407-palais-atelier", "guid": "ec1ed9ac-bf3a-5c1e-a30f-8f91306dfb01", "description": null, "capacity": 120}, {"name": "Maschinenhaus", "slug": "1408-maschinenhaus", "guid": "4f1054f5-30fc-5c91-87f6-ca46581fa54d", "description": null, "capacity": 120}, {"name": "Frannz Salon", "slug": "1409-frannz-salon", "guid": "0c3440ff-7857-546f-bf17-f9950cb59071", "description": null, "capacity": 80}], "tracks": [{"name": "Search", "slug": "2729-search", "color": "#009CB4"}, {"name": "Store", "slug": "2730-store", "color": "#AAB500"}, {"name": "Scale", "slug": "2731-scale", "color": "#EDB400"}, {"name": "Stream", "slug": "2732-stream", "color": "#D97585"}], "days": [{"index": 1, "date": "2022-06-12", "day_start": "2022-06-12T04:00:00+02:00", "day_end": "2022-06-13T03:59:00+02:00", "rooms": {"Palais Atelier": [{"guid": "4a2d7007-bc2c-52f6-bb2f-c8e1fc1b255b", "code": "BGLZAL", "id": 18604, "logo": null, "date": "2022-06-12T15:00:00+02:00", "start": "15:00", "duration": "03:00", "room": "Palais Atelier", "slug": "bbuzz22-18604-barcamp", "url": "https://pretalx.com/bbuzz22/talk/BGLZAL/", "title": "Barcamp", "subtitle": "", "track": null, "type": "Barcamp", "language": "en", "abstract": "Barcamps are informal sessions, a kind of \"un-conference\", with a schedule decided on the day. It is all driven by the interests and expertise of those who attend so each one is different, but ours are always great!\n\nAlthough the barcamp doesn't have a strict schedule, it won't be completely devoid of structure! #bbuzz barcamps are dynamic events, focused on the overall Berlin Buzzwords topics, tackling the same challenges but in a different format. At the barcamp each session runs for 30 minutes giving enough time to get into the meat of a topic, but without a chance of anyone getting bored. These are participatory sessions and more inclusive than regular conference talks, with everyone taking part. You can help by leading the session, by giving some insights, by asking some great questions, or maybe just with your enthusiasm.\n\nThe barcamp will be coordinated and moderated by Nick Burch.\n\nRegistration starts from 2:30pm", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "97HYST", "name": "Nick Burch", "avatar": "https://pretalx.com/media/avatars/97HYST_759PqjE.webp", "biography": "Nick is heavily involved in a number of Apache projects, such as Tika and POI, while having the fortune to know many of the people involved in the Apache Big Data and Search space! When not helping out with Apache things, Nick works as the Director of Engineering at FLEC, where he leads a team making heavy use of Open Source technologies. When not helping improve the logistics industry, he is often to be found attending or organising BarCamps, Geek Nights, or other such fun events dedicated to sharing what's great and new!", "public_name": "Nick Burch", "guid": "e7f4faed-180f-58b5-8048-70be86cb80f6", "url": "https://pretalx.com/bbuzz22/speaker/97HYST/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/BGLZAL/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/BGLZAL/", "attachments": []}]}}, {"index": 2, "date": "2022-06-13", "day_start": "2022-06-13T04:00:00+02:00", "day_end": "2022-06-14T03:59:00+02:00", "rooms": {"Kesselhaus": [{"guid": "0afd234a-1e03-5a38-ad1f-90ba214b4e9f", "code": "AKGDUD", "id": 18322, "logo": "https://pretalx.com/media/bbuzz22/submissions/AKGDUD/01._Coath_Fiona_d68re5m.png", "date": "2022-06-13T10:00:00+02:00", "start": "10:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz22-18322-meet-the-people-fighting-surveillance-capitalism", "url": "https://pretalx.com/bbuzz22/talk/AKGDUD/", "title": "Meet the people fighting surveillance capitalism", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "What does it mean for democracy when we live in a world where hyper personalised misinformation and bot armies manipulate public opinion? This propaganda is fueled by social media companies, who's business depends on growing their user base, increasing engagement and improving targeting. Just getting visibility on what users are being shown is challenging, even with current EU regulations. As is often the case, users in the global south are most vulnerable, without robust regulation and with fewer moderators per user for many languages.\n\nAs technologists we are well positioned to understand this threat. How might we leverage this to create positive change? By exploring examples of people who blew whistles, enabled regulation, or taught others how to stay safe online, we can take back hope and get inspired to fight back against surveillance capitalism.\n\n---\n#### Get your ticket now!\nRegister for Berlin Buzzwords in our [ticket shop](https://2022.berlinbuzzwords.de/registration)! We also have online tickets and reduced tickets for students available and you can find more information about our Diversity Ticket Initiative [here](https://2022.berlinbuzzwords.de/diversity-inclusivity)!", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "DZSDUX", "name": "Fiona Coath", "avatar": "https://pretalx.com/media/avatars/DZSDUX_EOiWfUr.webp", "biography": "Fiona is a lead software developer and consultant currently working at Thoughtworks. Her specialty is being a generalist. She enjoys problem solving across a range of technologies including natural language processing and data analytics. Her passion for social justice and equality keeps inspiring her to investigate the relationships between technology and society. When she isn\u2019t solving challenging problems, she enjoys craft, exploring the world and the colour purple.", "public_name": "Fiona Coath", "guid": "ca7ce181-3b4a-5d04-b9f2-ec192a921393", "url": "https://pretalx.com/bbuzz22/speaker/DZSDUX/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/AKGDUD/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/AKGDUD/", "attachments": []}, {"guid": "e2f920dc-f94c-5753-ab05-cc4c216abec8", "code": "ZUKUJ3", "id": 16062, "logo": "https://pretalx.com/media/bbuzz22/submissions/ZUKUJ3/02._Benton_Will_-_Watson_Sophie_wmFV16w.png", "date": "2022-06-13T11:00:00+02:00", "start": "11:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz22-16062-luxuries-necessities-and-the-challenges-that-remain-some-experiences-with-accelerated-data-science", "url": "https://pretalx.com/bbuzz22/talk/ZUKUJ3/", "title": "Luxuries, necessities, and the challenges that remain: some experiences with accelerated data science", "subtitle": "", "track": "Scale", "type": "Talk", "language": "en", "abstract": "The promise of accelerated computing presents an interesting paradox: while no one complains when new compute infrastructure is dramatically faster than its predecessor, few people realize how much they\u2019d benefit from acceleration until they have it.  It is perhaps unsurprising that a data scientist\u2019s daily work consists of tasks that they can accomplish with their available computing resources, but simply running our existing work faster makes acceleration into a mere luxury. For accelerated computing to fulfill its promise, we need it to transform our work by enabling us to do new things that wouldn\u2019t have been feasible without it. In this talk, we\u2019ll discuss our experiences accelerating data science with specialized hardware and by scaling out on clusters.  We\u2019ll present examples of previously-impossible techniques becoming feasible, of the pleasant luxury of improved performance, and of the data science tasks that aren\u2019t likely to justify additional hardware or implementation effort.  You\u2019ll leave this talk with a better understanding of how accelerated and scale-out computing can fit into your data science practice, a catalog of techniques that are still well served by standard hardware, and some actionable advice for how to take advantage of parallel and distributed computing across your workflow.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "DGEEBH", "name": "William Benton", "avatar": "https://pretalx.com/media/avatars/DGEEBH_Jg68fQ5.webp", "biography": "William Benton is passionate about making it easier for machine learning practitioners to benefit from advanced infrastructure and making it possible for organizations to manage machine learning systems. His recent roles have included defining product strategy and professional services offerings related to data science and machine learning, leading teams of data scientists and engineers, and contributing to many open source communities related to data, ML, and distributed systems. Will was an early advocate of building machine learning systems on Kubernetes and developed and popularized the \u201cintelligent applications\u201d idiom for machine learning systems in the cloud. He has also conducted research and development related to static program analysis, language runtimes, cluster configuration management, and music technology.", "public_name": "William Benton", "guid": "79c06759-4b7d-5b51-893e-90e4bacc578a", "url": "https://pretalx.com/bbuzz22/speaker/DGEEBH/"}, {"code": "YYBKHB", "name": "Sophie Watson", "avatar": "https://pretalx.com/media/avatars/YYBKHB_rELxtC6.webp", "biography": "Sophie is a Technical Marketing Manager at NVIDIA, where she strives to make Data Scientists' every day lives easier.  Sophie has focused on applying her data science and statistics skills to solving business problems and informing next-generation infrastructure for intelligent application development.", "public_name": "Sophie Watson", "guid": "1d7b4b3e-1751-54ab-8d21-38f11f4104d7", "url": "https://pretalx.com/bbuzz22/speaker/YYBKHB/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/ZUKUJ3/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/ZUKUJ3/", "attachments": []}, {"guid": "a18bd4da-a29e-5744-9985-2d1886e7f0f9", "code": "FHEHAL", "id": 16025, "logo": "https://pretalx.com/media/bbuzz22/submissions/FHEHAL/06._LeDem_Julien_fHOqqQx.png", "date": "2022-06-13T11:50:00+02:00", "start": "11:50", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz22-16025-cross-platform-data-lineage-with-openlineage", "url": "https://pretalx.com/bbuzz22/talk/FHEHAL/", "title": "Cross-Platform Data Lineage with OpenLineage", "subtitle": "", "track": "Store", "type": "Talk", "language": "en", "abstract": "There are more data tools available than ever before, and it's easier to build a pipeline than it's ever been. This has resulted in an explosion of innovation, but it also means that data within today's organizations has become increasingly distributed. It can't be contained within a single brain, a single team, or a single platform.\n\nData lineage can help by tracing the relationships between datasets and providing a map of your entire data universe. OpenLineage provides a standard for lineage collection that spans multiple platforms, including Apache Airflow, Apache Spark, Flink, and dbt. This empowers teams to diagnose and address widespread data quality and efficiency issues in real time. \n\nIn this session, Julien Le Dem from Datakin will show how to trace data lineage across Apache Spark and Apache Airflow. He will walk through the OpenLineage architecture and provide a live demo of a running pipeline with real-time data lineage.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "F8AVTQ", "name": "Julien Le Dem", "avatar": "https://pretalx.com/media/avatars/F8AVTQ_uASLpI3.webp", "biography": "Julien Le Dem is the Chief Architect of Astronomer and Co-Founder of Datakin. He co-created Apache Parquet and is involved in several open source projects including OpenLineage, Marquez (LFAI&Data), Apache Arrow, and Apache Iceberg. Previously, he was a senior principal at Wework; principal architect at Dremio; tech lead for Twitter\u2019s data processing tools, where he also obtained a two-character Twitter handle (@J_); and a principal engineer and tech lead working on content platforms at Yahoo, where he received his Hadoop initiation. His French accent makes his talks particularly attractive.", "public_name": "Julien Le Dem", "guid": "1b1d54d9-ea4e-5e66-85b7-8fbdb60e8a4e", "url": "https://pretalx.com/bbuzz22/speaker/F8AVTQ/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/FHEHAL/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/FHEHAL/", "attachments": []}, {"guid": "3c69d4ae-36f6-5ad5-8c92-83aee1e0fbf1", "code": "NEQRBP", "id": 16133, "logo": "https://pretalx.com/media/bbuzz22/submissions/NEQRBP/10._Lutz_Yunus_-_Schuett_Andrea_ieYh1df.png", "date": "2022-06-13T12:40:00+02:00", "start": "12:40", "duration": "00:20", "room": "Kesselhaus", "slug": "bbuzz22-16133-offline-ranking-validation-predicting-a-b-test-results", "url": "https://pretalx.com/bbuzz22/talk/NEQRBP/", "title": "Offline Ranking Validation - Predicting A/B Test Results", "subtitle": "", "track": "Search", "type": "Short Talk", "language": "en", "abstract": "Implementing a machine learning model for ranking in an ecommerce search requires a well-designed approach to how the target metric is defined. In our team we validate our target metrics with online tests on live traffic. This requires both long preparation times and long enough runtimes to yield valid results. Having to choose only a few candidates for the next A/B test is hard and slows us down significantly. So what if we had a way to evaluate the candidates beforehand to make a more informed decision?\n\nWe came up with an approach to predict how a certain ranking will perform in an onsite test. We leverage historic user interaction data from search events and try to correlate them with ranking metrics like NDCG. This gives us insights on how well the ranking meets the user intent. This is not meant to be a replacement for a real A/B test, but allows us to narrow down the field of candidates to a manageable number. In this talk we will share our approach to offline ranking validation and how it performed in practice.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "NBCMQN", "name": "Andrea Schuett", "avatar": "https://pretalx.com/media/avatars/NBCMQN_WR4mmet.webp", "biography": "Andrea Sch\u00fctt is a Data Scientist at OTTO\u2019s search team. Currently she is working on bringing OTTO\u2019s first learning to rank model into production. She has a degree in electrical engineering with a focus on automation.", "public_name": "Andrea Schuett", "guid": "f1d64d91-c948-53d6-b5a6-246d23304d65", "url": "https://pretalx.com/bbuzz22/speaker/NBCMQN/"}, {"code": "89FN7S", "name": "Yunus Lutz", "avatar": "https://pretalx.com/media/avatars/89FN7S_eiqL0KX.webp", "biography": "Yunus is a Data Scientist at Otto, where he works on bringing Otto\u2019s first learning to rank model into production. Prior to joining Otto, he worked as a Data Scientist and Engineer at Deloitte, where he developed pragmatic and data-driven solutions for various clients. He holds a M.Sc. in Quantitative Economics with a focus on statistics and time series analysis.", "public_name": "Yunus Lutz", "guid": "9718b206-357d-53e3-8a5e-c8023a499400", "url": "https://pretalx.com/bbuzz22/speaker/89FN7S/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/NEQRBP/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/NEQRBP/", "attachments": []}, {"guid": "11a6af9a-4879-5d2d-b0d2-4ac6c55dd7c9", "code": "LUBLXC", "id": 16052, "logo": "https://pretalx.com/media/bbuzz22/submissions/LUBLXC/31._Barki_Noaa_CMJwJPw.png", "date": "2022-06-13T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz22-16052-what-we-learned-from-reading-100-kubernetes-post-mortems", "url": "https://pretalx.com/bbuzz22/talk/LUBLXC/", "title": "What we learned from reading 100+ Kubernetes Post-Mortems", "subtitle": "", "track": "Scale", "type": "Talk", "language": "en", "abstract": "When building our Kubernetes-native product, we wanted to find the most common sources of failures, anti-patterns and root causes for Kubernetes outages, so we got to work. We rolled up our sleeves and read 100+ Kubernetes post-mortems. This is what we discovered.\n\nA smart person learns from their own mistakes, but a truly wise person learns from the mistakes of others.\n\nWhen launching our product, we wanted to learn as much as possible about typical pains in our ecosystem, and did so by reviewing many post-mortems (100+!) to discover the recurring patterns, anti-patterns, and root causes of typical outages in Kubernetes-based systems.\n\nIn this talk we have aggregated for you the insights we gathered, and in particular will review the most obvious DON\u2019Ts and some less obvious ones, that may help you prevent your next production outage by learning from others\u2019 real world (horror) stories.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "V8DN3L", "name": "Noaa Barki", "avatar": "https://pretalx.com/media/avatars/V8DN3L_IqLoa2Q.webp", "biography": "Noaa is a full-stack developer, community manager, and tech writer who wishes to encourage developers to deepen the decisions we make during the development processes, research about the technologies we use and share our knowledge. She started her journey in the 8200 Unit of the IDF Intelligence forces where Noaa took her first steps in software development. In the last 4 years, her work has mainly included Angular, .NET, VanillaJS, and Typescript. She currently develops in React, NodeJS and Golang.", "public_name": "Noaa Barki", "guid": "ee4d3ffa-1236-536f-9a1f-0b55967308a5", "url": "https://pretalx.com/bbuzz22/speaker/V8DN3L/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/LUBLXC/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/LUBLXC/", "attachments": []}, {"guid": "13cc2f10-c3b6-55ce-81ca-9106f113313e", "code": "ZFZJAK", "id": 16126, "logo": "https://pretalx.com/media/bbuzz22/submissions/ZFZJAK/17._Godbillot_Pascal_-_Tapi_Nzali_Mike_aAPDfPS.png", "date": "2022-06-13T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz22-16126-reproducible-and-shareable-notebooks-across-a-data-science-team", "url": "https://pretalx.com/bbuzz22/talk/ZFZJAK/", "title": "Reproducible and shareable notebooks across a data science team", "subtitle": "", "track": "Scale", "type": "Talk", "language": "en", "abstract": "At CybelAngel we scan the internet looking for sensitive data leaks belonging to our clients. \nAs the volume of alerts could count billions of samples, we use machine learning to throw away as much noise as possible to reduce the analysts' workload.\n\nWe are a growing team of data scientists and a machine learning engineer, planning to double in size. Each of us contributes to projects and we use Notebooks before code industrialisation. As for many other data science teams, a lot of effort and valuable work is encapsulated in a format that is tricky to share, hardly reproducible and simply not built for production purposes. During the talk, we will present what we did to overcome some of these issues and our feedback about notebook versioning and implementation in Google Cloud Platform using open JupyterHub and Jupytext.\n\nThis talk is addressed to a technical audience but all roles gravitating around a data team are welcome to grasp the challenges of the interaction of data science within the organisation.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "RRVY3E", "name": "Mike Tapi Nzali", "avatar": "https://pretalx.com/media/avatars/RRVY3E_Ke1VtMV.webp", "biography": "I am a machine learning engineer at CybelAngel with a PhD in Computer Science. I like to work in a startup environment, also leading the development of machine learning products from idea to production. I am interested in cutting-edge technology, sharing knowledge and industrialization of Machine Learning.", "public_name": "Mike Tapi Nzali", "guid": "ea62ebe5-ac76-5ccf-8ce0-f03ebe436266", "url": "https://pretalx.com/bbuzz22/speaker/RRVY3E/"}, {"code": "NKZYYG", "name": "Pascal Godbillot", "avatar": "https://pretalx.com/media/avatars/NKZYYG_RKxJC1P.webp", "biography": "Pascal is a Data Scientist at CybelAngel, Paris. He is focusing on building robust and efficient machine learning models to identify all kinds of digital threats. He also has a strong interest in various subjects related to Machine Learning Operations (MLOps). He is eager to solve the technological challenges of tomorrow in the AI field where innovation and knowledge sharing are paramount.", "public_name": "Pascal Godbillot", "guid": "ebbd782e-df7d-5b89-86ab-823c3ab383ea", "url": "https://pretalx.com/bbuzz22/speaker/NKZYYG/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/ZFZJAK/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/ZFZJAK/", "attachments": []}, {"guid": "93e47db3-0a23-5733-b8be-f91fc651abb7", "code": "YDNXAK", "id": 15347, "logo": "https://pretalx.com/media/bbuzz22/submissions/YDNXAK/20._Benedetti_Alessandro_JYeGJ6Z.png", "date": "2022-06-13T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz22-15347-neural-search-comes-to-apache-solr-approximate-nearest-neighbor-bert-and-more-buzzwords", "url": "https://pretalx.com/bbuzz22/talk/YDNXAK/", "title": "Neural Search Comes to Apache Solr: Approximate Nearest Neighbor, BERT and More (Buzzwords)!", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "The first integrations of machine learning techniques with search allowed to improve the ranking of your search results (Learning To Rank) - but one limitation has always been that documents had to contain the keywords that the user typed in the search box in order to be retrieved.\nFor example, the query \u201ctiger\u201d won\u2019t retrieve documents containing only the terms \u201cpanthera tigris\u201d.\nThis is called the vocabulary mismatch problem and over the years it has been mitigated through query and document expansion approaches.\nNeural search is an Artificial Intelligence technique that allows a search engine to reach those documents that are semantically similar to the user\u2019s query without necessarily containing those terms; it avoids the need for long lists of synonyms by automatically learning the similarity of terms and sentences in your collection through the utilisation of deep neural networks and numerical vector representation.\nThis talk explores the first Apache Solr official contribution about this topic, available from Apache Solr 9.0.\nDuring the talk we will give an overview of neural search (Don\u2019t worry - we will keep it simple!): we will describe vector representations for queries and documents, and how Approximate K-Nearest Neighbor (KNN) vector search works. \nWe will show how neural search can be used along with deep learning techniques (e.g, BERT) or directly on vector data, and how we implemented this feature in Apache Solr, giving usage examples!\nJoin us as we explore this new exciting Apache Solr feature and learn how you can leverage it to improve your search experience!\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "GJ3PTP", "name": "Alessandro Benedetti", "avatar": "https://pretalx.com/media/avatars/GJ3PTP_7fNBvIJ.webp", "biography": "Alessandro Benedetti is director and R&D Software Engineer at Sease Ltd.\nHis focus is on information retrieval, information extraction, natural language processing, and machine learning.\nAt Sease Alessandro is working on Search/Machine learning R&D and consultancies.\nWhen he isn't on clients' projects, he is actively contributing to the open-source community and presenting the applications of leading-edge techniques in real world scenarios at meet-ups and conferences such as ECIR, the Lucene/Solr Revolution, ApacheCon, Haystack, FOSDEM, and Open Source Summit.", "public_name": "Alessandro Benedetti", "guid": "3c1785eb-a159-54b7-ae0d-3a4b4ff96067", "url": "https://pretalx.com/bbuzz22/speaker/GJ3PTP/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/YDNXAK/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/YDNXAK/", "attachments": []}, {"guid": "b81c74fd-f661-53b5-9738-64695f51386a", "code": "MF3HUM", "id": 19822, "logo": null, "date": "2022-06-13T16:50:00+02:00", "start": "16:50", "duration": "00:20", "room": "Kesselhaus", "slug": "bbuzz22-19822-a-smooth-ride-online-car-buying-and-selling-at-mobile-de", "url": "https://pretalx.com/bbuzz22/talk/MF3HUM/", "title": "A smooth ride: Online car buying and selling at mobile.de", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Mobile.de is Germany's largest online vehicle marketplace. Under the hood, there are more data products and machine learning solutions than one could imagine when thinking about an online classifieds platform. In this talk, we will present the main decision-making checkpoints in the car buying and selling scenarios, and which are mobile.de data products  support users in their journey. Our talk will present an overview of all data topics and provide a deeper look on a few of them.\n\nThis talk is sponsored by mobile.de", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "RAT8VA", "name": "Ricardo Kawase", "avatar": "https://pretalx.com/media/avatars/RAT8VA_X0ovyQ3.webp", "biography": "Dr. Ricardo Kawase leads a team of Data Scientists and Engineers mobile.de GmbH/Adevinta, the leading online automotive marketplace in Germany. His team is responsible for topics such fraud fighting/prevention, price prediction, user profiling, customer behavior prediction and personalization. He holds a Ph.D in Computer Science (Doctor rerum naturalium, Dr. rer. nat.) from the Gottfried Wilhelm Leibniz Universit\u00e4t Hannover, Germany. Before joining mobile.de he worked as a researcher for over 7 years at the L3S Research center in Hannover on several topics such as data mining, information retrieval, semantic web, e-learning, social networks, crowdsourcing and Web science in general. He has written, collaborated, and published over 60 peer reviewed academic articles, and serves as a reviewer in several conferences and journals.", "public_name": "Ricardo Kawase", "guid": "d1936a71-735f-5855-9007-58a9d63be9b9", "url": "https://pretalx.com/bbuzz22/speaker/RAT8VA/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/MF3HUM/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/MF3HUM/", "attachments": []}, {"guid": "d9282eb8-06d5-57b6-a8e7-f27a7acff9b4", "code": "YEHRTE", "id": 16139, "logo": "https://pretalx.com/media/bbuzz22/submissions/YEHRTE/27._Solbakken_Lester_XfjSDmV.png", "date": "2022-06-13T17:20:00+02:00", "start": "17:20", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz22-16139-hybrid-search-sum-of-its-parts", "url": "https://pretalx.com/bbuzz22/talk/YEHRTE/", "title": "Hybrid search > sum of its parts?", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "Over the decades, information retrieval has been dominated by classical methods such as BM25. These lexical models are simple and effective yet vulnerable to vocabulary mismatch. With the introduction of pre-trained language models such as BERT and its relatives, deep retrieval models have achieved superior performance with their strong ability to capture semantic relationships. The downside is that training these deep models is computationally expensive, and suitable datasets are not always available for fine-tuning toward the target domain.\n\nWhile deep retrieval models work best on domains close to what they have been trained on, lexical models are comparatively robust across datasets and domains. This suggests that lexical and deep models can complement each other, retrieving different sets of relevant results. But how can these results effectively be combined? And can we learn something from language models to learn new indexing methods? This talk will delve into both these approaches and exemplify when they work well and not so well. We will take a closer look at different strategies to combine them to get the best of both, even in zero-shot cases where we don't have enough data to fine-tune the deep model.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "89XRTP", "name": "Lester Solbakken", "avatar": "https://pretalx.com/media/avatars/89XRTP_Njhd9xK.webp", "biography": "Principle Software Engineer at Verizon Media (previously Yahoo) on the Vespa platform, the open big data serving engine (vespa.ai). Focus areas are machine learning engineering with emphasis on serving and search system ranking. Previously pursued a PhD within Artificial Intelligence and Machine Learning. Main research topics were neural networks, exploratory data analysis and self-organizing systems.", "public_name": "Lester Solbakken", "guid": "3d4fa49e-0361-56dd-b6e9-03e74a6dc712", "url": "https://pretalx.com/bbuzz22/speaker/89XRTP/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/YEHRTE/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/YEHRTE/", "attachments": []}], "Palais Atelier": [{"guid": "7ed717bd-b7db-5dc5-8449-c0b2faed6d78", "code": "XSJXRT", "id": 15175, "logo": "https://pretalx.com/media/bbuzz22/submissions/XSJXRT/05._Kutsenko_Olena_2BBKR6c.png", "date": "2022-06-13T11:00:00+02:00", "start": "11:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz22-15175-apache-kafka-simply-explained", "url": "https://pretalx.com/bbuzz22/talk/XSJXRT/", "title": "Apache Kafka simply explained", "subtitle": "", "track": "Stream", "type": "Talk", "language": "en", "abstract": "You\u2019re curious about what Apache Kafka does and how it works, but between the terminology and explanations that seem to start at a complex level, it's been difficult to embark. This session is different. We'll talk about what Kafka is, what it does and how it works in simple terms with easy to understand and funny examples that you can share later at a dinner table with your family.\n\nThis session is for curious minds, who might have never worked with distributed streaming systems before, or are beginners to event streaming applications.\n\nBut let simplicity not deceive you - by the end of the session you\u2019ll be equipped to create your own Apache Kafka event stream!", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "LLBXBT", "name": "Olena Kutsenko", "avatar": "https://pretalx.com/media/avatars/LLBXBT_Q3Gez1v.webp", "biography": "Olena is a software engineer and a developer advocate currently working at Aiven. She is passionate about open source, data, sustainable software development and team work. Her knowledge is shaped by expertise she acquired working in such companies as Nokia, HERE Technologies and AWS; and from the countries she was lucky to live in - Ukraine, Sweden, Spain and Germany.", "public_name": "Olena Kutsenko", "guid": "aa43d05d-1405-5f91-a0a4-9845c7b38f2c", "url": "https://pretalx.com/bbuzz22/speaker/LLBXBT/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/XSJXRT/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/XSJXRT/", "attachments": []}, {"guid": "2df78dd7-45e6-5b2c-8144-516dbb3d2bb2", "code": "BRRB7M", "id": 16172, "logo": "https://pretalx.com/media/bbuzz22/submissions/BRRB7M/07._Paponaud_Aline_4FFKbLd.png", "date": "2022-06-13T11:50:00+02:00", "start": "11:50", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz22-16172-scaling-an-online-search-engine-to-thousands-of-physical-stores", "url": "https://pretalx.com/bbuzz22/talk/BRRB7M/", "title": "Scaling an online search engine to thousands of physical stores", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "An online e-commerce search engine is easy to put in place. Scaling it to serve millions of users, adding a marketplace to provide thousands of products, supporting multiple offers, prices and stocks on the same product are additional challenges more difficult to address. And what if, in addition, you mix your online search engine with the activity of thousands of physical stores?\n\nIn this talk we explain how we addressed all these challenges in the context of the largest retail group and online grocery store in France. The constraint of multiple physical stores backed by the online search engine introduces additional challenges that we emphasize and address in detail. Our point of view, as we explain the challenges and solutions, is both technical and functional.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "H8BKZC", "name": "Aline Paponaud", "avatar": "https://pretalx.com/media/avatars/H8BKZC_FBSJttu.webp", "biography": "CTO of [Adelean](https://adelean.com), working with search and providing consulting services and expertise around Elasticsearch, Lucene and Solr. She brings her energy to leveraging search engines, as they become more and more essential in every domain.", "public_name": "Aline Paponaud", "guid": "4d1906bf-78ce-5198-87fb-eda9bf5ed4d8", "url": "https://pretalx.com/bbuzz22/speaker/H8BKZC/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/BRRB7M/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/BRRB7M/", "attachments": []}, {"guid": "d1341fd4-efb6-5793-b46f-f923fb533578", "code": "MP9BBT", "id": 16023, "logo": "https://pretalx.com/media/bbuzz22/submissions/MP9BBT/11._Fisher_Eli_bT9tGoP.png", "date": "2022-06-13T12:40:00+02:00", "start": "12:40", "duration": "00:20", "room": "Palais Atelier", "slug": "bbuzz22-16023-why-a-search-engine-makes-a-great-log-analytics-solution", "url": "https://pretalx.com/bbuzz22/talk/MP9BBT/", "title": "Why a Search Engine Makes a Great Log Analytics Solution", "subtitle": "", "track": "Search", "type": "Short Talk", "language": "en", "abstract": "Search Engine technologies, like OpenSearch, have continued to grow in popularity for a number of different use cases. Features like full-text search, fast ingestion, scalability, faceting, and extensible plugin frameworks were often enhanced with the aim to improve the search use case. However, the side effect of these improvements provided much of the foundation that led people to adopting these technologies for other uses like click stream analytics, log analytics, security analytics, and more.\n\nIn this talk we will explore how features that started as search enhancements opened the door for new use cases and why we continue to see affinity between search engines and broader analytics workloads.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "JVBEUB", "name": "Eli Fisher", "avatar": "https://pretalx.com/media/avatars/JVBEUB_KHc8F2u.webp", "biography": "Eli Fisher has worked on databases and analytics for Microsoft, Google, and Amazon. He also co-founded the IoT-based digital signage company, Raydiant. He now leads the Product Management team at AWS that is working on the OpenSearch Project.", "public_name": "Eli Fisher", "guid": "d97169f5-cdb9-5c61-899f-d2ed3b0eb6eb", "url": "https://pretalx.com/bbuzz22/speaker/JVBEUB/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/MP9BBT/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/MP9BBT/", "attachments": []}, {"guid": "9a30a6d1-de66-5c02-bd04-6c2363a42298", "code": "7BJSBH", "id": 16186, "logo": "https://pretalx.com/media/bbuzz22/submissions/7BJSBH/13._Grygleski_Mary_Tkwbept.png", "date": "2022-06-13T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz22-16186-benefits-of-mqtt-for-iot-messaging-and-beyond", "url": "https://pretalx.com/bbuzz22/talk/7BJSBH/", "title": "Benefits of MQTT for IoT Messaging and Beyond", "subtitle": "", "track": "Stream", "type": "Talk", "language": "en", "abstract": "IoT applications run on IoT devices and can be created to be specific to almost every industry and vertical, from small devices to large ones, including healthcare, industrial automation, smart homes and buildings, automotive, and wearable technology. The possibilities are limitless. Increasingly, IoT applications are using AI and machine learning to add intelligence to devices. Among all of the variables in the IoT ecosystem, one common theme is the need to be able to handle the constrained operating environment, such as unreliable network connectivity, limited bandwidth, low battery power, and so on. We will take a look into the MQTT protocol, how it has evolved from its early days which was intended for the connection of oil pipelines via satellite, to now the ever-increasing demand in IoT and M2M applications, to how this protocol will evolve to meet the modern needs especially in the current cloud computing era. We will study a few outstanding MQTT libraries that are available in the market, such as the Java-based HiveMQ, and open source libraries such as Eclipse Mosquitto.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "MCMEAK", "name": "Mary Grygleski", "avatar": "https://pretalx.com/media/avatars/MCMEAK_4qMBpRs.webp", "biography": "Mary is a Java Champion and a passionate Senior/Lead Developer Advocate at HiveMQ, the leading provider of MQTT IoT & IIoT messaging and enterprise-grade cloud-native software. She spent 3.5 years as a very effective advocate at IBM, focusing on Java, Jakarta EE, OpenJ9, Open Source, Cloud, and Distributed Systems.  She transitioned from Unix/C to Java around 2000 and has never looked back since then.  She considers herself a polyglot and loves to continue learning new and better ways to solve real-life problems. She is an active tech community builder outside of her day job, and currently the President of the Chicago Java Users Group (CJUG), as well as a co-organizer for several IBM-sponsored meetup groups in the Greater Chicago area.", "public_name": "Mary Grygleski", "guid": "8179dd04-8b13-5e8e-856e-b4c99c78cdea", "url": "https://pretalx.com/bbuzz22/speaker/MCMEAK/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/7BJSBH/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/7BJSBH/", "attachments": []}, {"guid": "7eb2814f-8977-5250-b69b-5bea4d10acce", "code": "YKABW8", "id": 15702, "logo": "https://pretalx.com/media/bbuzz22/submissions/YKABW8/16._Lord_Joel_BcmzBnK.png", "date": "2022-06-13T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz22-15702-help-i-need-to-unsqlize-my-application", "url": "https://pretalx.com/bbuzz22/talk/YKABW8/", "title": "Help! I Need To UnSQLize My Application", "subtitle": "", "track": "Store", "type": "Talk", "language": "en", "abstract": "More and more people are moving from old-school relational databases to a variant of NoSQL. If starting a green-field project with a document database is easy, it can be a different story when migrating from one to the other. Simply porting SQL tables to a collection might cause you more harm than good. In this talk, the attendees will learn about the basic concepts of document databases, such as documents and collections. They will then learn about some of the standard data schemas available. Finally, the speaker will show real-life examples of data migration and how they can be applied to adopt a new NoSQL database.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "9MGZ9T", "name": "Joel Lord", "avatar": "https://pretalx.com/media/avatars/9MGZ9T_N4qGqMg.webp", "biography": "Joel Lord (@joel__lord on Twitter) is passionate about the web and technology in general. He likes to learn new things, but most of all, he wants to share his discoveries. He does so by travelling at various conferences all across the globe.\nHe graduated from college in computer programming in the last millennium. Apart from a little break to get his BSc in computational astrophysics, he was always in the industry.\nIn his daily job, Joel is a developer advocate with MongoDB, where he connects with software engineers to help them make the web better by using best practices in web development.\nDuring his free time, he can be found stargazing on a campground somewhere or brewing a fresh batch of beer in his garage.", "public_name": "Joel Lord", "guid": "262788de-4500-51ff-8a49-1bbec83154c5", "url": "https://pretalx.com/bbuzz22/speaker/9MGZ9T/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/YKABW8/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/YKABW8/", "attachments": []}, {"guid": "15b29082-95f8-502b-9a66-9840ddb8a379", "code": "P9ZFJL", "id": 16182, "logo": "https://pretalx.com/media/bbuzz22/submissions/P9ZFJL/21.__Brehin_Vincent_-_Precup_Lucian_QkYKs9U.png", "date": "2022-06-13T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz22-16182-the-life-of-a-search-engine-administrator", "url": "https://pretalx.com/bbuzz22/talk/P9ZFJL/", "title": "The life of a search engine administrator", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "Defining the KPIs, keeping an eye on the customer satisfaction and sales, defining the backlog, configuring the search engine, debugging relevance issues, preventing regressions \u2026 These are a few tasks on the list of a search engine administrator. A search engine is a living thing. Seasonality, levels of stocks, lifecycle of the products, marketing events, news, etc. are a few of the many factors that force the search engine to constantly evolve. In this context, the life of a search engine manager is tough. In this talk we describe the processes and tools that we put in place and help manage a search engine. We also address the limits between what can be automated and what still needs human supervision.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "DXAHHB", "name": "Lucian Precup", "avatar": "https://pretalx.com/media/avatars/DXAHHB_cVQTaMu.webp", "biography": "Lucian Precup is the CTO of [all.site](https://all.site/) - the collaborative search engine developed at [Station F](http://stationf.co) in Paris. With his colleagues at [Adelean](http://adelean.com), Lucian develops solutions for indexing, searching and analyzing data. Lucian regularly shares his knowledge in specialized conferences and organizes the [Search & Data Meetup](https://www.meetup.com/fr-FR/search-and-data/).", "public_name": "Lucian Precup", "guid": "035144ed-418f-562d-8a48-6a6f84e1ac41", "url": "https://pretalx.com/bbuzz22/speaker/DXAHHB/"}, {"code": "CCJXVT", "name": "Vincent Br\u00e9hin", "avatar": "https://pretalx.com/media/avatars/CCJXVT_kBiRGUJ.webp", "biography": "Search addict since 2006, Vincent has contributed to many Search engines integrations and Search Based Applications. With http://www.adelean.com he helps customers find their way to effective Search, from the business case up to the JVM tuning .", "public_name": "Vincent Br\u00e9hin", "guid": "fd8a149f-3c89-5b7b-9d8c-e709589162c8", "url": "https://pretalx.com/bbuzz22/speaker/CCJXVT/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/P9ZFJL/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/P9ZFJL/", "attachments": []}, {"guid": "d1751344-9bfe-5b57-9cf2-f331d7d23798", "code": "SXZTDG", "id": 16158, "logo": "https://pretalx.com/media/bbuzz22/submissions/SXZTDG/24._Irwin_Max_4r0zeQY.png", "date": "2022-06-13T16:50:00+02:00", "start": "16:50", "duration": "00:20", "room": "Palais Atelier", "slug": "bbuzz22-16158-the-race-to-the-bottom-low-latency-in-the-age-of-the-transformer", "url": "https://pretalx.com/bbuzz22/talk/SXZTDG/", "title": "The Race to the Bottom - Low Latency in the age of the Transformer", "subtitle": "", "track": "Scale", "type": "Short Talk", "language": "en", "abstract": "So you want to deploy a large language model, and keep your latency SLA?  NLP adds enormous value to customers, but getting it to work efficiently is fraught with uncertainty and high cost.  As transformers and other big neural network architectures make their way into your platform, you may be finding it difficult to get the speed and throughput you need within your budget, or even understand why it is so expensive.\n\nThis talk will give an overview of the latency and throughput challenges, and how to solve them.  We will give an overview in the product and cost implications as well as the technical improvements that can be used to get things running fast.  We will compare solutions and help make sense of difficult to understand technology.\n\nThe audience will walk away with the information they need to decide on the best direction for inference in their production platform.\n\nKeywords: MLOps, Inference, Latency", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "3FZCNK", "name": "Max Irwin", "avatar": "https://pretalx.com/media/avatars/3FZCNK_iV4W8BH.webp", "biography": "Max Irwin is the founder of https://max.io, and is a contributing author of the book \"AI Powered Search\".  Prior to founding MAX.IO, he was Managing Consultant at OpenSource Connections, and was the founding leader of the Search Center of Excellence at Wolters Kluwer.\n\nMax has over 20 years of experience directing delivery and strategy of large scale applications in various industries, with 10 of those years globally managing large diverse teams to improve search quality to drive results. He has deep practical hands-on technical expertise in search relevance, customer experience, natural language processing, and growing quality-focused culture.", "public_name": "Max Irwin", "guid": "76adb1f7-f838-5e24-891b-76e0d3d4cd2d", "url": "https://pretalx.com/bbuzz22/speaker/3FZCNK/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/SXZTDG/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/SXZTDG/", "attachments": []}, {"guid": "5c028093-8bee-58e3-a8ad-5cd365fff567", "code": "EZVVNK", "id": 15783, "logo": "https://pretalx.com/media/bbuzz22/submissions/EZVVNK/28._Roy_Shubhro_ZKIEzFb.png", "date": "2022-06-13T17:20:00+02:00", "start": "17:20", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz22-15783-architecting-solr-indexing-pipelines-in-google-cloud-platform", "url": "https://pretalx.com/bbuzz22/talk/EZVVNK/", "title": "Architecting Solr indexing pipelines in Google Cloud Platform", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "The ubiquity of public cloud platforms has made it easy to offload operational overhead of maintaining on-premise systems and leverage the ability to scale these systems on-demand in a matter of minutes. But architecting a secure scalable systems in the public cloud comes with its own challenges. This problem is further complicated when you are migrating from an on-premise system. Such migrations often require infrastructure to operate in a hybrid state where some parts of the system have been migrated to the cloud while remaining components continue to run on-premise. We must also ensure that the migration is invisible to the user and there is no impact to overall availability of the system during this transition. Recently Box Search underwent such a migration for our Solr indexing pipeline and document store which involved migrating hundreds of terabytes of customer data from on-premise to GCP. In this talk we present the overall system architecture, the migration process and some of the challenges we encountered when running this system in a hybrid state.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "WHJCVU", "name": "Shubhro Jyoti Roy", "avatar": "https://pretalx.com/media/avatars/WHJCVU_1KtTdnN.webp", "biography": "Shubhro Roy is a Staff Engineer and Tech Lead on the Search Team at Box. His team is responsible powering search and discovery capabilities for Box which involves running and maintaining a petabyte scale search index on Solr. Prior to Box, he was building query engines for the Database group at Oracle. He has been working on distributed systems and information retrieval for 10+ years after graduating from Carnegie Mellon with Masters in Information Systems and Machine Learning.", "public_name": "Shubhro Jyoti Roy", "guid": "551f2688-9fe7-5c88-8278-f8d158f3af90", "url": "https://pretalx.com/bbuzz22/speaker/WHJCVU/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/EZVVNK/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/EZVVNK/", "attachments": []}], "Maschinenhaus": [{"guid": "c493c619-e6b4-5d42-b84a-40c0351e66a0", "code": "WYC9ZB", "id": 16046, "logo": "https://pretalx.com/media/bbuzz22/submissions/WYC9ZB/04._Schindler_Uwe_HbalW7O.png", "date": "2022-06-13T11:00:00+02:00", "start": "11:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz22-16046-the-future-of-lucene-s-mmapdirectory-why-use-it-and-what-s-coming-with-java-19-and-later", "url": "https://pretalx.com/bbuzz22/talk/WYC9ZB/", "title": "The future of Lucene's MMapDirectory: Why use it and what's coming with Java 19 and later?", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "Since version 3 of Apache Lucene and Solr and from the early beginning of Elasticsearch, the general recommendation was to use MMapDirectory as the implementation for index access on disk. But why is this so important?\n\nThis talk will first introduce the user about the technical details of memory mapping and why using other techniques slows down index access by a significant amount. Of course we no longer need to talk about 32/64bit Java VMs - everybody uses now 64 bits with Elasticsearch and Solr, but with current Java versions, Lucene still has some 32bit-like limitations on accessing the on-disk index with memory mapping. We will discuss those limitations especially with growing index size up to terabytes, and afterwards, Uwe will give an introduction to the new Java Foreign Memory Access API (JEP 370, JEP 383, JEP 393, JEP 412, JEP 419), that first appeared with Java 14, but still incubating.\n\nThis talk will give an overview of the the foreign memory API to be finalized and released to general availability in Java 19 and will present the current state of implementation in Lucene 10. Uwe will show how future versions of Lucene will be backed by next generation memory mapping and what needs to be done to make this usable in Solr and Elasticsearch - bringing you memory mapping for indexes with tens or maybe hundreds of Terabytes in the future!\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "HRJC87", "name": "Uwe Schindler", "avatar": "https://pretalx.com/media/avatars/HRJC87_iOiv7er.webp", "biography": "Uwe is committer and PMC member of Apache Lucene and Apache Solr. His main focus is on development of Lucene Core. He implemented fast numerical search and is maintaining the new attribute-based text analysis API. He studied Physics at the University of Erlangen-Nuremberg and works as managing director for SD DataSolutions GmbH in Bremen, Germany, a company that provides consulting and support for Apache Lucene, Elasticsearch, and Apache Solr. He also works for \u201cPANGAEA \u2013 Publishing Network for Geoscientific & Environmental Data\u201d where he implemented the portal's geo-spatial retrieval functions with Lucene Java. Uwe had talks about Lucene at various international conferences like the previous Berlin Buzzwords, ApacheCon EU/US, Lucene Revolution, Lucene Eurocon, and various local meetups.", "public_name": "Uwe Schindler", "guid": "ccd6817c-c966-5f38-ba01-e8e691b99c55", "url": "https://pretalx.com/bbuzz22/speaker/HRJC87/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/WYC9ZB/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/WYC9ZB/", "attachments": []}, {"guid": "d4b8f0b5-39e0-54e3-994b-97e23086df21", "code": "RSWMWB", "id": 16127, "logo": "https://pretalx.com/media/bbuzz22/submissions/RSWMWB/08._Pop_Radu_DTXn84v.png", "date": "2022-06-13T11:50:00+02:00", "start": "11:50", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz22-16127-searching-through-large-graphs-using-elasticsearch", "url": "https://pretalx.com/bbuzz22/talk/RSWMWB/", "title": "Searching through large graphs using Elasticsearch", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "The National Audiovisual Institute (INA) is a repository of all French audiovisual archives, being responsible for archiving over 180 radio and television services, 24/7, since 1995. The generated metadata describing this content currently represents the equivalent of over 50 million documents (e.g.: images, audio and video fragments, text excerpts, etc.). Due to the heterogeneity of the content, the data model is directly inspired from the conceptual models of cultural heritage, represented by a large graph with complex relations between generic entities.\n\nThe challenge for building a global search engine for this particular use case is twofold: on one hand, the capacity to index and maintain the entire set of documents updated in a reasonable amount of time, and on the other hand the implementation of complex full text search capabilities with high performance.\n\nOur talk describes the key choices for the graph representation, facilitating the indexing process of the documents, as well as the technical framework set up around Elasticsearch, implementing dedicated search APIs required by different functional areas.\n\nWe also briefly mention the implementation optimisations that lead to a full process of 50 million documents in less than 48 hours, for an equivalent of 800GB Elasticsearch index.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "JNFKMQ", "name": "Radu Pop", "avatar": "https://pretalx.com/media/avatars/JNFKMQ_BxhO8Hb.webp", "biography": "Radu is providing Consulting Services as Solutions Architect at Adelean. He handles projects around Elasticsearch and Adelean\u2019s A2 search technology. He oversees the integration and evolution of search engines within large e-commerce platforms and marketplaces. Prior to joining Adelean, Radu acquired a solid experience in Web archiving, operating large scale crawling systems in the context of several European research projects. He holds a PhD in Computer Science and a MSc in Distributed Systems.", "public_name": "Radu Pop", "guid": "4d32fb92-f6a4-5c72-8144-f601b4c250b7", "url": "https://pretalx.com/bbuzz22/speaker/JNFKMQ/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/RSWMWB/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/RSWMWB/", "attachments": []}, {"guid": "eb48c15c-56af-594b-85e6-de3d914e8686", "code": "7TYXQN", "id": 16010, "logo": "https://pretalx.com/media/bbuzz22/submissions/7TYXQN/14._Bergum_Jo_Kristian_Cr8uKI0.png", "date": "2022-06-13T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz22-16010-ai-powered-semantic-search-a-story-of-broken-promises", "url": "https://pretalx.com/bbuzz22/talk/7TYXQN/", "title": "AI-powered Semantic Search; A story of broken promises?", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "Semantic search using AI-powered vector embeddings of text, where relevancy is measured using a vector similarity function, has been a hot topic for the last few years. As a result, platforms and solutions for vector search have been springing up like mushrooms. Even traditional search engines like Elasticsearch and Apache Solr ride the semantic vector search wave and now support fast but approximative vector search, a building block for supporting AI-powered semantic search at scale. \n\nUndoublty, sizeable pre-trained language models like BERT have revolutionized the state-of-the-art on data-rich text search relevancy datasets. However, the question search practitioners are asking themself is, do these models deliver on their promise of an improved search experience when applied to their domain? Furthermore, is semantic search the silver bullet which outcompetes traditional keyword-based search across many search use cases? This talk delves into these questions and demonstrates how these semantic models can dramatically fail to deliver their promise when used on unseen data in new domains.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "ZWB38G", "name": "Jo Kristian Bergum", "avatar": "https://pretalx.com/media/avatars/ZWB38G_UC6rwVn.webp", "biography": "Distinguished Engineer @Yahoo working on @vespaengine. Tweets about Vespa, search, recommendation, ranking, and IR.", "public_name": "Jo Kristian Bergum", "guid": "d0ade7cc-d910-542c-b1c4-d2e4ad51a63c", "url": "https://pretalx.com/bbuzz22/speaker/ZWB38G/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/7TYXQN/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/7TYXQN/", "attachments": []}, {"guid": "c6164d04-4b4b-51dd-850b-a7c5db2f61f6", "code": "M7DXXN", "id": 16083, "logo": "https://pretalx.com/media/bbuzz22/submissions/M7DXXN/18._Goodman_Richard_tJtZ5st.png", "date": "2022-06-13T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz22-16083-using-solr-unconventionally-to-serve-26bn-documents", "url": "https://pretalx.com/bbuzz22/talk/M7DXXN/", "title": "Using Solr unconventionally to serve 26bn+ documents", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "Learn how the Data Infrastructure team at Brandwatch rearchitected a group of their current Solr clusters and took a new approach in an unconventional manner. By splitting up the reads and writes, experimenting with Solr plugins, using S3, an application written in Rust and adopting the Solr Operator to spin up a cluster on Kubernetes, we were able to achieve our goal of having a cloud-based cluster which comfortably serves 26bn+ documents.\n\nYou'll understand the whys of our approach, things we discovered, what we have planned, and why rearchitecting things can be a difficult and strenuous task.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "39ZDGX", "name": "Richard Goodman", "avatar": "https://pretalx.com/media/avatars/39ZDGX_A8kskyS.webp", "biography": "Currently a Senior Data Infrastructure Engineer at Brandwatch in Brighton, UK, having joined fresh out of University several years ago. Working in the Data Infrastructure team is all about making Brandwatch's data safe, reliable and available. Using techs such as Solr, Kafka, HBase, Airflow, Postgres and Kubernetes. I developed an interest in Solr early on in my career and that has been my main focus working in this team. Over the last couple of years I began my journey in contributing to the open-source community submitting a few small patches for Solr.\n\nI take a keen interest in anything Solr, Python and monitoring, having spent a good time elevating our monitoring stack here at Brandwatch, overhauling our entire monitoring stack for Solr. More recently, I've lead a project to take a cluster from the data center to host in AWS in a cost-effective manner and have better scalability. \n\nOutside of work I take interest in classical music, being trained in Piano and Flute, sewing, D&D and video games.", "public_name": "Richard Goodman", "guid": "57b7be03-055d-5d20-b059-418c135f53db", "url": "https://pretalx.com/bbuzz22/speaker/39ZDGX/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/M7DXXN/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/M7DXXN/", "attachments": []}, {"guid": "3cd50ce7-7fc0-5600-968c-be1f98d79f90", "code": "MFKQZX", "id": 16145, "logo": "https://pretalx.com/media/bbuzz22/submissions/MFKQZX/22._Jasiskis_Andre_-_Torres_Joaquim_rT7FCmW.png", "date": "2022-06-13T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz22-16145-the-perils-of-building-a-democratic-data-platform", "url": "https://pretalx.com/bbuzz22/talk/MFKQZX/", "title": "The perils of building a democratic data platform", "subtitle": "", "track": "Scale", "type": "Talk", "language": "en", "abstract": "It is clearly beneficial for an organization to make data-driven decisions,\ndecentralize access to data processing and empower every team to generate valuable information. \n\nThere are many ways to achieve these goals, but in an environment of rapid growth, building an accessible Data Platform is just the first step. What happens next determines its long-term success or its dramatic demise.\n\nIn this presentation, we discuss the main perils of building a platform that\nprocesses over 80000 unique datasets built by 1000 people across different\nteams, how to avoid them, and where to go from there.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "87NUJQ", "name": "Andre Jasiskis", "avatar": "https://pretalx.com/media/avatars/87NUJQ_6IGAED5.webp", "biography": "Andre Jasiskis is a founding member of Nubank's data platform, currently working on Nubank's streaming platform. Has been thinking a lot about data ingestion, fast batch, and streaming processing, specifically on how to handle the exponential growth in consumers and data volume.", "public_name": "Andre Jasiskis", "guid": "a6143038-2a73-52b1-8122-959aab1ba792", "url": "https://pretalx.com/bbuzz22/speaker/87NUJQ/"}, {"code": "7LCBUH", "name": "Joaquim Torres", "avatar": "https://pretalx.com/media/avatars/7LCBUH_T7tQwAY.webp", "biography": "Lead Software Engineer at Nubank, building streaming infrastructure to enable multiple teams to manage their real-time data products.\n\nHe is a generalist at heart with a penchant for distributed systems and platform software.", "public_name": "Joaquim Torres", "guid": "2bb6facf-3eae-598e-9699-f068dd87bdd8", "url": "https://pretalx.com/bbuzz22/speaker/7LCBUH/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/MFKQZX/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/MFKQZX/", "attachments": []}, {"guid": "3c642a0c-467a-52b4-8145-6c1ee4b547b0", "code": "LRFELQ", "id": 15102, "logo": "https://pretalx.com/media/bbuzz22/submissions/LRFELQ/Meriam_Simona_kOnJlju.png", "date": "2022-06-13T16:50:00+02:00", "start": "16:50", "duration": "00:20", "room": "Maschinenhaus", "slug": "bbuzz22-15102-logging-apache-spark-how-we-made-it-easy", "url": "https://pretalx.com/bbuzz22/talk/LRFELQ/", "title": "Logging Apache Spark - How we made it easy", "subtitle": "", "track": "Search", "type": "Short Talk", "language": "en", "abstract": "Are you familiar with the following Scenario?\n\nYou're running your Apache Spark app on EMR, and the log file gets pretty heavy. You try and open it through the AWS UI, or download it straight to your computer. You end up connecting to the server running your driver or any of your executors, relentlessly searching your logs while simultaneously looking at Ganglia and the Spark UI for additional logs and metrics.\n\nIf you are, this talk is exactly for you.\n\nLet me tell you how made it all easy with just some bootstrap actions, some bash scripts, Beats and Elastic. Customizable per app logging, with less searching of big log files and more looking into useful Kibana dashboards. This architecture is not nice to have, it's essential.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "MPVVAK", "name": "Simona Meriam", "avatar": "https://pretalx.com/media/avatars/MPVVAK_iHl1w55.webp", "biography": "Simona Meriam is a Senior Data Engineer at Aidoc, where she specializes in research and development of solutions for big data infrastructures. In her previous position as a Big Data Engineer at Nielsen, she researched and developed big data solutions using cutting-edge technologies such as Spark, Kafka, and Elasticsearch. In her spare time she enjoys talking, talking about music that you'll probably think is weird, Japan and data.", "public_name": "Simona Meriam", "guid": "bd425d5a-b436-506d-be60-9c46fa0df5fe", "url": "https://pretalx.com/bbuzz22/speaker/MPVVAK/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/LRFELQ/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/LRFELQ/", "attachments": []}, {"guid": "7f576ff1-f86d-5cfb-88e3-dc028d864148", "code": "FPSRQ9", "id": 15678, "logo": "https://pretalx.com/media/bbuzz22/submissions/FPSRQ9/03._Wu_Qi_O1WW2Vh.png", "date": "2022-06-13T17:20:00+02:00", "start": "17:20", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz22-15678-compress-giant-language-models-to-effective-and-resource-saving-models-using-knowledge-distillation", "url": "https://pretalx.com/bbuzz22/talk/FPSRQ9/", "title": "Compress giant language models to effective and resource-saving models using knowledge distillation", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "Language models have drawn a lot of attention in NLP in recent years. Despite their short history of development, they have been employed and delivered astonishing performances in all sorts of NLP tasks, such as translation, question answering, information extraction and intelligent search.\n\nHowever, we should not forget that giant language models are not only data hungry, but also energy hungry. State-of-the-art language models such as BERT, RoBERTa and XLNet process millions of parameters, which is only possible with the help of dozens of sophisticated and expensive chips. The CO2 generated in the process is also massive. Being responsible for such high energy consumption is not easy in times of climate change.\n\nIn order for companies to benefit from the performance of state-of-the-art language models without putting too much strain on their computing costs, the models used must be reduced to a minimum. Of course, performance should not suffer as a result. One possible means to achieve this is the so-called knowledge distillation, which is one common technique among model compression methods. In this presentation, we will show you how you can use knowledge distillation to generate models that achieve comparable performances as state-of-the-art language models effectively, and in a resource-saving manner.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "KMAXGR", "name": "Qi Wu", "avatar": "https://pretalx.com/media/avatars/KMAXGR_E5x5PGf.webp", "biography": "Qi Wu works as a Machine Learning Engineer at ontolux to translate current research results into usable applications for our customers. She works on topics such as training and optimizing models, with a focus on finetuning and distillation. During her master's degree in statistics, she has already worked with Prof. Dr. Alan Akbik on the NLP framework FLAIR and worked on machine learning in the area of natural language processing, such as information extraction.", "public_name": "Qi Wu", "guid": "1d46997f-f99d-5c53-a4da-75bea7f978c2", "url": "https://pretalx.com/bbuzz22/speaker/KMAXGR/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/FPSRQ9/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/FPSRQ9/", "attachments": []}], "Frannz Salon": [{"guid": "94031870-6150-5cc2-b92e-1717fc3072a7", "code": "MGL3ZF", "id": 16146, "logo": "https://pretalx.com/media/bbuzz22/submissions/MGL3ZF/09._Blanco_Cordero_Javier_-_Neubauer_Tomas_0F3Jbi1.png", "date": "2022-06-13T11:50:00+02:00", "start": "11:50", "duration": "01:00", "room": "Frannz Salon", "slug": "bbuzz22-16146-live-build-how-to-harness-streaming-data-in-real-time-to-track-transform-and-build-on-heart-rate-data", "url": "https://pretalx.com/bbuzz22/talk/MGL3ZF/", "title": "Live build: How to harness streaming data in real time to track, transform and build on heart rate data", "subtitle": "", "track": "Stream", "type": "Workshop", "language": "en", "abstract": "This case study offers an entertaining way to learn about the possibilities of stream processing, which can be applied to projects in fields that require easy access to current information, such as finance, mobility and energy. We\u2019ll use the Quix platform to set up a series of open source data sets and code samples that collect, transform and deliver data under a machine learning model that learns to handle real-time heart rate data. We\u2019ll show how to include complex transformations to the data, such as how to calculate calories burned with Python.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "GSE7FN", "name": "Tom\u00e1\u0161 Neubauer", "avatar": "https://pretalx.com/media/avatars/GSE7FN_JQriUEQ.webp", "biography": "Tomas Neubauer is cofounder and CTO at Quix, responsible for the technical direction of the company across the full technical stack, and working as a technical authority for the engineering team.  He was previously technical lead at McLaren, where he led architecture uplift for Formula One racing realtime telemetry acquisition. He later led platform development outside motorsport, reusing the knowhow he gained from racing.", "public_name": "Tom\u00e1\u0161 Neubauer", "guid": "6e0cbd15-0c2a-50ad-b455-d649840f2893", "url": "https://pretalx.com/bbuzz22/speaker/GSE7FN/"}, {"code": "7Q3L8F", "name": "Javier Blanco Cordero", "avatar": "https://pretalx.com/media/avatars/7Q3L8F_nf6uHOG.webp", "biography": "Javier Blanco Cordero is a senior data scientist at Quix, where he helps customers get the most out of their data science projects. He was previously a senior data scientist at Orange, developing churn prediction, marketing mix modeling, propensity to purchase models and more. Javier is a master's lecturer and speaker specializing in pragmatic data science and causality.", "public_name": "Javier Blanco Cordero", "guid": "5aae339d-ded6-51e9-a732-cb12b74316cf", "url": "https://pretalx.com/bbuzz22/speaker/7Q3L8F/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/MGL3ZF/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/MGL3ZF/", "attachments": []}, {"guid": "bbd1c4f9-0f52-5c85-ba6d-b682a74d1eb1", "code": "G7JZV7", "id": 15230, "logo": "https://pretalx.com/media/bbuzz22/submissions/G7JZV7/15._Sarkar_Amrit_awmIfWF.png", "date": "2022-06-13T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz22-15230-kafka-monitoring-what-matters", "url": "https://pretalx.com/bbuzz22/talk/G7JZV7/", "title": "Kafka Monitoring: What Matters!", "subtitle": "", "track": "Stream", "type": "Talk", "language": "en", "abstract": "Due to Apache Kafka's widespread integration into enterprise-level infrastructures, monitoring Kafka performance at scale has become an increasingly important task. It can be challenging to understand what is happening in Kafka - both at the application level and lag performance, to successfully root cause/troubleshoot problems. To perform effective diagnosis, meaningful insights and visibility throughout all levels of the cluster are a must.\n\nThis talk will take a dive into what metrics or indicators matter most while running Kafka at Scale focusing on Lag performance. How to interpret and correlate these indicators, build dashboards and configure meaningful alerts to identify a probable issue to take place. This talk concludes with the idea of doing trend analysis to detect anomalies for long-running Kafka pipelines.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "XMVRDM", "name": "Amrit Sarkar", "avatar": "https://pretalx.com/media/avatars/XMVRDM_JFuP08Q.webp", "biography": "Amrit Sarkar is a Software Engineer at Apple India Pvt Ltd with 6+ years of experience in the search domain and big data, e-commerce, and product.", "public_name": "Amrit Sarkar", "guid": "255e9f5e-4b44-5d3f-93ba-c8ecdaa7c1a8", "url": "https://pretalx.com/bbuzz22/speaker/XMVRDM/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/G7JZV7/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/G7JZV7/", "attachments": []}, {"guid": "8ec6931f-6a34-5a92-8d7d-c3e22a0f8724", "code": "SYJWRU", "id": 16161, "logo": "https://pretalx.com/media/bbuzz22/submissions/SYJWRU/Hoenicke_Florian_-_Werk_Maximilian_5wHNpvQ.png", "date": "2022-06-13T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz22-16161-neural-search-let-s-talk-about-quality", "url": "https://pretalx.com/bbuzz22/talk/SYJWRU/", "title": "Neural Search - Let's talk about quality", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "### Context\n\nIn the past year the interest in Neural Search and vector search engines increased heavily. They promise to solve multi modal, cross modal and semantic search problems with ease. However, when quickly trying Neural search with off-the-shelf pre-trained models the results are quite disillusioning. They lacking knowledge about the data at hand. In order to explicitly solve model finetuning for search problems we implemented an open-source finetuner. It is directly usable with several vector databases due to the underlying data structure.\n\n### Presentation\n\nIn our talk we present our methodology and performance on an example dataset. Afterwards, we show how well the approach transfers to other datasets, such as deepfashion, geolocation geoguessr and more. It will give hands-on guidance on how you can finetune a model in order to make your data better searchable.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "MBZKVT", "name": "Maximilian Werk", "avatar": "https://pretalx.com/media/avatars/MBZKVT_OMLdqG0.webp", "biography": "I enjoy bringing machine learning into production at Jina.ai as an engineering director. The combination of high quality engineering, digging into data and the real-world problem at hand thrills me.", "public_name": "Maximilian Werk", "guid": "2dc94080-444a-5437-a93d-7fc5a64e467f", "url": "https://pretalx.com/bbuzz22/speaker/MBZKVT/"}, {"code": "WQUX9K", "name": "Florian Hoenicke", "avatar": "https://pretalx.com/media/avatars/WQUX9K_3D5v4bU.webp", "biography": "When working in large organizations like SoundCloud, Deloitte and Axel-Springer I learned that the hardest challenges for tech companies are not of technical nature. As a Solution Lead at Jina, I analyze the challenges of our clients and come up with customized solutions. Based on these learnings, I propose changes to our Framework in order to push the quality and accessibility of neural search.", "public_name": "Florian Hoenicke", "guid": "f766274c-a567-5c2e-a6e6-b1be633aef54", "url": "https://pretalx.com/bbuzz22/speaker/WQUX9K/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/SYJWRU/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/SYJWRU/", "attachments": []}, {"guid": "5d327165-bf86-551b-923e-1208ed371eef", "code": "LRJRBX", "id": 16166, "logo": "https://pretalx.com/media/bbuzz22/submissions/LRJRBX/23._Walther_Timo_7fdJQ0w.png", "date": "2022-06-13T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz22-16166-changelog-stream-processing-with-apache-flink", "url": "https://pretalx.com/bbuzz22/talk/LRJRBX/", "title": "Changelog Stream Processing with Apache Flink", "subtitle": "", "track": "Stream", "type": "Talk", "language": "en", "abstract": "We all know that the world is constantly changing. Data is continuously produced and thus should be consumed in a similar fashion by enterprise systems. Message queues and logs such as Apache Kafka can be found in almost every architecture, while databases and other batch systems still provide the foundation. Change Data Capture (CDC) has become popular to capture committed changes from a database and propagate those changes to downstream consumers.\n\nIn this talk, we will introduce Apache Flink as a general data processor for various kind of use cases on both finite and infinite streams. We demonstrate Flink's SQL engine as a changelog processor that is shipped with an ecosystem tailored to process CDC data and maintain materialized views. We will use Kafka as an upsert log, Debezium for connecting to databases, and enrich streams of various sources using different kinds of joins.\n\nFinally, we illustrate how to combine Flink's Table API with DataStream API for event-driven applications beyond SQL.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "QCJZ9N", "name": "Timo Walther", "avatar": "https://pretalx.com/media/avatars/QCJZ9N_wV42Tp9.webp", "biography": "Timo Walther is a long-term committer and PMC member of the Apache Flink project. He studied Computer Science at TU Berlin. Alongside his studies, he participated in the Database Systems and Information Management Group there and worked at IBM Germany. Timo worked as a software engineer and lead of the SQL team at Ververica. In Flink, he is working on various topics in the Table & SQL ecosystem.", "public_name": "Timo Walther", "guid": "5bf84a46-c47c-5645-94f9-b75be87b5300", "url": "https://pretalx.com/bbuzz22/speaker/QCJZ9N/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/LRJRBX/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/LRJRBX/", "attachments": []}, {"guid": "b31850c7-aff6-54c4-85b2-29cacaa56a7b", "code": "CZRCNG", "id": 16155, "logo": "https://pretalx.com/media/bbuzz22/submissions/CZRCNG/26._Angelatos_Petros_8CgsHRu.png", "date": "2022-06-13T16:50:00+02:00", "start": "16:50", "duration": "00:20", "room": "Frannz Salon", "slug": "bbuzz22-16155-change-data-capture-with-debezium-and-without", "url": "https://pretalx.com/bbuzz22/talk/CZRCNG/", "title": "Change data capture with Debezium\u2026and without", "subtitle": "", "track": "Stream", "type": "Short Talk", "language": "en", "abstract": "\"Change Data Capture (CDC) has become a mundane commodity, much in part due to the ever-rising success of [Debezium](https://debezium.io/). But what happens when you want to keep track of changes in your upstream database without having a message broker in your stack? In this talk, we\u2019ll walk through how we built a direct Postgres CDC connector at [Materialize](https://materialize.com/) to provide an alternative to our CDC support through Kafka+Debezium.\"", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "C99QE8", "name": "Petros Angelatos", "avatar": "https://pretalx.com/media/avatars/C99QE8_2R6MNIP.webp", "biography": "Petros is a software engineer at Materialize where he started the development of the Postgres CDC connector as a demo project. He is the co-founder and former CTO of [balena.io](balena.io), a platform for managing remote fleets of Linux devices.", "public_name": "Petros Angelatos", "guid": "4b00fa73-7c6d-5c14-80a8-2dc86f10c55a", "url": "https://pretalx.com/bbuzz22/speaker/C99QE8/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/CZRCNG/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/CZRCNG/", "attachments": []}, {"guid": "3d9e162b-2225-5116-9f10-259c073f77de", "code": "YSCAV8", "id": 16013, "logo": "https://pretalx.com/media/bbuzz22/submissions/YSCAV8/30._Tosca_Edo_q3OY7Y8.png", "date": "2022-06-13T17:20:00+02:00", "start": "17:20", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz22-16013-entity-linking-at-scale-with-lucene", "url": "https://pretalx.com/bbuzz22/talk/YSCAV8/", "title": "Entity Linking at scale with Lucene", "subtitle": "", "track": "Scale", "type": "Talk", "language": "en", "abstract": "Signal AI offers a sophisticated platform to support businesses in their decision making. Customers define searches across billions of documents by using an extensive DSL that includes concepts like entities and topics amongst them.\nThis metadata is being extracted from over 5 million documents each day and is made  available to the end users within 30 seconds from its ingestion via a mix of machine learning and text retrieval techniques. \n\nEntity Linking is one of the core capabilities in the Signal AI data processing platform. It is a complex system that uses various strategies to achieve the highest quality while retaining excellent throughput characteristics.\n\nBack in 2019, one of the existing components of the Entity Linking system was rapidly reaching its limits and could not scale anymore.\nTo overcome the limitation, the team took an innovative approach and used Apache Lucene with its inverted index and term vectors capabilities to enable the identification of rule-based entities.\nBy choosing a percolator model the team had to revisit the previous architecture, breaking it down into smaller components that follow the Single Responsibility Principle for microservices.\n\nThis talk will take the audience through the evolution of this service, from its inception until today. It will provide details around the technical decisions and trade-offs that make this component one of the most resilient, fast and cost effective solutions, capable of handling 20 times more the number of rules at a fraction of the cost. It will also discuss how the same technology is used to reprocess the entire dataset every night in approximately 15 minutes.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "CRQBQW", "name": "Edoardo Tosca", "avatar": "https://pretalx.com/media/avatars/CRQBQW_wCMP9mI.webp", "biography": "Edoardo is an experienced software craftsman. He is obsessed with business problems and desperate to find the best technology to solve them. He is an open source enthusiast with a particular interest in search engines and machine learning.\n\nCurrently he is Head of Technology at Signal AI.", "public_name": "Edoardo Tosca", "guid": "b146ab5f-7275-56b7-ad84-317350d98539", "url": "https://pretalx.com/bbuzz22/speaker/CRQBQW/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/YSCAV8/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/YSCAV8/", "attachments": []}]}}, {"index": 3, "date": "2022-06-14", "day_start": "2022-06-14T04:00:00+02:00", "day_end": "2022-06-15T03:59:00+02:00", "rooms": {"Kesselhaus": [{"guid": "04939da1-65e3-58db-b518-2303b119c3f9", "code": "WNYRZF", "id": 16118, "logo": "https://pretalx.com/media/bbuzz22/submissions/WNYRZF/57._Arora_Atita_fMU40c8.png", "date": "2022-06-14T10:10:00+02:00", "start": "10:10", "duration": "00:20", "room": "Kesselhaus", "slug": "bbuzz22-16118-understanding-vespa-with-a-lucene-mindset", "url": "https://pretalx.com/bbuzz22/talk/WNYRZF/", "title": "Understanding Vespa with a Lucene mindset", "subtitle": "", "track": "Search", "type": "Short Talk", "language": "en", "abstract": "Vespa is no more a 'new kid on the block' in the domain of search and big data. Everyone is wooed over reading about its capabilities in search, recommendation, and machine-learned aspects augmenting search especially for large data-sets. With so many great features to offer and so less documentation to how to get started on Vespa , we want to take an opportunity to introduce it to the lucene based search users. \nWe will cover about Vespa architecture , getting started , leveraging advance features , important aspects all in the analogies easier for someone with a fresh or lucene based search engines mindset.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "TRRRA8", "name": "Atita Arora", "avatar": "https://pretalx.com/media/avatars/TRRRA8_2nqHoJb.webp", "biography": "Atita works as a Search Relevance Consultant at OpenSource Connections. \nShe develops, consults, and optimize Enterprise & E-commerce search engines for more than 10 years. She holds 2 Masters degrees in Computer Applications and Strategic Business Management. She specializes in search platform migration to the cloud, building index pipelines, custom component development, and tuning relevance. She has a keen interest in personalizing search and influencing customer interaction using NLP, ML, and AI.", "public_name": "Atita Arora", "guid": "eee55867-8d23-504e-86da-f445cdf7b5ae", "url": "https://pretalx.com/bbuzz22/speaker/TRRRA8/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/WNYRZF/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/WNYRZF/", "attachments": []}, {"guid": "eaaa6a5b-334b-5de9-84d7-2be2a72a39a2", "code": "GJWBDH", "id": 18819, "logo": "https://pretalx.com/media/bbuzz22/submissions/GJWBDH/63._Menendez_Lara_Garcia_-_Mueller_Nina_Qst03au.png", "date": "2022-06-14T10:40:00+02:00", "start": "10:40", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz22-18819-goodbye-tracking-hello-privacy-the-technology-architecture-behind-ethical-search-discovery", "url": "https://pretalx.com/bbuzz22/talk/GJWBDH/", "title": "Goodbye Tracking, Hello Privacy: The Technology & Architecture behind Ethical Search & Discovery", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "Search is a vital part of the online experience and for many brands a key way to interact with their customers. Yet search results are too often derived from data collected by trackers and analytics, tools that disrespect human rights and GDPR or CCPA regulations. In this talk, we'll outline the negative impact of tracking while exploring alternative solutions that actively protect privacy without detracting from the search experience. \n\nKey takeaways:\n\n- Learn the key principles of a privacy-first platform architecture \n- Explore high demand performance stability in a data protected environment\n- Liberty of liability: look, but don\u2019t touch personal data\n\nThis talk is sponsored by Empathy.co", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "QJULAB", "name": "Nina M\u00fcller", "avatar": "https://pretalx.com/media/avatars/QJULAB_6cARSJl.webp", "biography": "Nina is the Ethical Commerce Alliance Director at Empathy.co. She has lots of marketing and event organisation experience, especially in the tech industry having run Berlin Buzzwords and other conferences before. \nIn her current role, Nina is responsible for establishing partnerships and connecting people in a network committed to privacy, data and ethics towards a more human-centric approach to the online world.\nWhen she\u2019s not at her desk, Nina enjoys music and cooking, has a weakness for licorice and loves the cinema.", "public_name": "Nina M\u00fcller", "guid": "b1b360aa-c361-56fc-b3d2-a8b991eaa304", "url": "https://pretalx.com/bbuzz22/speaker/QJULAB/"}, {"code": "CL9ZZH", "name": "Lara Men\u00e9ndez Garc\u00eda", "avatar": "https://pretalx.com/media/avatars/CL9ZZH_MZs2Oxe.webp", "biography": "Lara Menendez is Product Director and member of the Executive Board at Empathy.co, a leading innovator in commerce search and discovery. She is responsible for all aspects of the product. Her role is critical in the product strategy, focusing on commerce search experiences that evoke positive emotions.\nPrior to joining Empathy.co, Lara\u2019s work experience included data engineering and analytics positions with Pull&Bear and Merkle. Her crucial role in overseeing global product management at Empathy is reflected in how she leads the product team: A team that is committed to building human-centred commerce search and discovery platforms that empower big brands to create trustworthy, understanding and joyful experiences.\nAfter living in Asturias and Galicia (Spain), she is now based in London as part of Empathy\u2019s team in their UK headquarters. She loves London vibes but she misses Spanish gastronomy. When she\u2019s not at her desk, you can find Lara enjoying a good Spanish wine.", "public_name": "Lara Men\u00e9ndez Garc\u00eda", "guid": "1aca1a7a-962e-5265-8e67-d7565bff36e9", "url": "https://pretalx.com/bbuzz22/speaker/CL9ZZH/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/GJWBDH/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/GJWBDH/", "attachments": []}, {"guid": "3ff9845a-a734-5b8e-964a-a8bc1b5a27d9", "code": "A97PXE", "id": 16078, "logo": "https://pretalx.com/media/bbuzz22/submissions/A97PXE/48._Dubrovsky_Opher_-_Nadler_Ido_TX5aGUG.png", "date": "2022-06-14T11:30:00+02:00", "start": "11:30", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz22-16078-scaling-your-kafka-pipeline-can-be-a-pain-but-it-doesn-t-have-to-be", "url": "https://pretalx.com/bbuzz22/talk/A97PXE/", "title": "Scaling your Kafka pipeline can be a pain - but it doesn\u2019t have to be!!", "subtitle": "", "track": "Stream", "type": "Talk", "language": "en", "abstract": "Kafka data pipeline maintenance can be painful.\nIt usually comes with complicated and lengthy recovery processes, scaling difficulties,  traffic \u2018moodiness\u2019, and latency issues after downtimes and outages.\n\nIt doesn\u2019t have to be that way! \n\nWe\u2019ll examine one of our multi-petabyte scale Kafka pipelines, and go over some of the pitfalls we\u2019ve encountered. We\u2019ll offer solutions that alleviate those problems, and go over comparisons between the before and after . We\u2019ll then explain why some common sense solutions do not work well and offer an improved, scalable and resilient way of processing your stream.\n\nWe\u2019ll cover:\n- Costs of processing in stream compared to in batch\n- Scaling out for bursts and reprocessing \n- Making the tradeoff between wait times and costs\n- Recovering from outages \n- And much more\u2026", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "P7AKPF", "name": "Opher Dubrovsky", "avatar": "https://pretalx.com/media/avatars/P7AKPF_YgxDBIg.webp", "biography": "I am a director of data engineering at Nielsen. \nMy group builds massive data pipelines that are cost effective and scalable (~250 Billion events/day). Our projects run on AWS, using Kafka, Spark, Aerospike, serverless Lambda functions, Airflow, OpenFAAS, Kubernetes and more.\nI am passionate about new technologies, data, algorithms and machine learning. I love to tackle difficult problems and come up with amazing solutions to them. \nI have 4 patents in the area of security, and lots of ideas for more..", "public_name": "Opher Dubrovsky", "guid": "d0139766-1f46-55cb-b930-9d41250936e8", "url": "https://pretalx.com/bbuzz22/speaker/P7AKPF/"}, {"code": "CDSGJF", "name": "Ido Nadler", "avatar": "https://pretalx.com/media/avatars/CDSGJF_q9CsuSH.webp", "biography": "I am a big data team lead at Nielsen. \nMy team focuses on building massive data pipelines (~250 Billion events/day) and infrastructure for running machine learning algorithms. Our projects run on AWS using a variety of technologies like Kafka, Spark, Airflow, Kubernetes, and more. \nI like to continuously experiment with new technologies, tackle challenging problems, and find those better, more elegant, and cost-effective solutions.", "public_name": "Ido Nadler", "guid": "15c485ee-16aa-54e8-bc78-bb04d390e542", "url": "https://pretalx.com/bbuzz22/speaker/CDSGJF/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/A97PXE/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/A97PXE/", "attachments": []}, {"guid": "03d0157c-71e2-5a1d-af8d-b4b6457696b6", "code": "DHJRQC", "id": 15256, "logo": "https://pretalx.com/media/bbuzz22/submissions/DHJRQC/42._Fernandez_Ramiro_Alvaro_-_Hernandez_Daniel_-_Panizo_Alvar_uRsr0T6.png", "date": "2022-06-14T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz22-15256-running-apache-spark-on-k8s-from-aws-emr-to-k8s", "url": "https://pretalx.com/bbuzz22/talk/DHJRQC/", "title": "Running Apache Spark on K8s: From AWS EMR to K8s", "subtitle": "", "track": "Scale", "type": "Talk", "language": "en", "abstract": "Spark is a trend technology that it is being used for a lot of companies for large-scale data analytics. During the first try, companies usually try to use the cloud provider solution to speed up their time to market, but once Spark is broadly embrace by more teams in the company and the solution should be able to be multi cloud provider, then the Kubernetes adoption appear and the journey to make it happen its worth to share to inspire others in the same situation. In this talk the audience will learn some benefits to migrate from AWS EMR to Spark on Kubernetes, from operability point of view (reliability, portability, scalability), through observability and finally reviewing efficiency and costs. This talk is a real use case three teams at Empathy.co were working during 6 months to make their solution more agnostic and with minimum cloud dependencies.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "V8GBBD", "name": "Ramiro Alvarez Fernandez", "avatar": "https://pretalx.com/media/avatars/V8GBBD_RfxK9ge.webp", "biography": "I\u2019m a Senior DevOps Engineer currently working as Tech Lead of the Platform Engineering Team at Empathy.co. I mostly manage Kubernetes Clusters, CI/CD orchestration, Elasticsearch, MongoDB and try to break things on AWS, GCP and Azure. I'm a big fan of Anton Babenko and if I'm not online you can find me on Pagerduty.", "public_name": "Ramiro Alvarez Fernandez", "guid": "574adfc4-c7b8-5129-9672-263c22351766", "url": "https://pretalx.com/bbuzz22/speaker/V8GBBD/"}, {"code": "ADUJZF", "name": "\u00c1lvaro Panizo", "avatar": "https://pretalx.com/media/avatars/ADUJZF_c7HVwLo.webp", "biography": "DataScience&Search Product Owner and developer in Empathy. I worked in all the search areas in the last years, from the relevancy to data science and from the pure backend to manage the merchandiser and customer needs.\nExperience with high availability systems using k8s, different cloud providers. Talking about search and search intelligence I usually have fun with technologies like Spark or Elasticsearch but also love multidisciplinary teams with knowledge over all the development process (CI/CD, metrics, performance...).\n\nAbout the real me, I like to travel, be (more or less) healthy and plants but... who doesn't?\nPlease if you see me around pay a drink.", "public_name": "\u00c1lvaro Panizo", "guid": "0c76ac60-5ffc-5e26-9fd0-633f5ba197c1", "url": "https://pretalx.com/bbuzz22/speaker/ADUJZF/"}, {"code": "MUU3QC", "name": "Daniel Hern\u00e1ndez Alfageme", "avatar": "https://pretalx.com/media/avatars/MUU3QC_3cwipTI.webp", "biography": "I am a software engineer working as a Data Engineer for Empathy.co. My work is focused on building and managing ETL pipelines that feed our search engine with contextual information to improve the search experience on the final users and provide aggregated analytics to merchandisers, always keeping privacy in mind. I am passionate about data engineering and I perform my daily work with technologies like Apache Flink, Apache Spark, MongoDB running in AWS or GCP.", "public_name": "Daniel Hern\u00e1ndez Alfageme", "guid": "28b4ee97-b8e3-5cb0-b59a-3d8db1e9fe41", "url": "https://pretalx.com/bbuzz22/speaker/MUU3QC/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/DHJRQC/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/DHJRQC/", "attachments": []}, {"guid": "45ca7701-8d6f-5169-a855-cbf4de437396", "code": "UYZAUX", "id": 16032, "logo": "https://pretalx.com/media/bbuzz22/submissions/UYZAUX/46._Antuzi_Daniele_-_Petreti_Ilaria_gMVMnyn.png", "date": "2022-06-14T14:40:00+02:00", "start": "14:40", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz22-16032-word2vec-model-to-generate-synonyms-on-the-fly-in-apache-lucene", "url": "https://pretalx.com/bbuzz22/talk/UYZAUX/", "title": "Word2Vec model to generate synonyms on the fly in Apache Lucene", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "If you want to expand your query/documents with synonyms in Apache Lucene, you need to have a predefined file containing the list of terms that share the same semantic.\nIt's not always easy to find a list of basic synonyms for a language and, even if you find it, this doesn\u2019t necessarily match with your contextual domain.\nThe term \"daemon\" in the domain of operating system articles is not a synonym of \"devil\" but it's closer to the term \"process\".\n\nWord2Vec is a two-layer neural network that takes as input a text and outputs a vector representation for each word in the dictionary.\nTwo words with similar meanings are identified with two vectors close to each other.\n\nThis talk explores our contribution to Apache Lucene that integrates this technique with the text analysis pipeline.\nWe will show how you can automatically generate synonyms on the fly from an Apache Lucene index and how you can use this new feature along with Apache Solr with practical examples!\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "SQCBZK", "name": "Daniele Antuzi", "avatar": "https://pretalx.com/media/avatars/SQCBZK_eKyVhCo.webp", "biography": "Daniele Antuzi is a software engineer passionate about high-performance data structures and algorithms. He has been working for 4 years in finance (List spa) and 2 years in cloud services (Amazon Web Services) but the curiosity to learn more about information retrieval brings him to join Sease Ltd.\nHe likes studying and experimenting with new technologies trying to reduce the gap between academia and industry.", "public_name": "Daniele Antuzi", "guid": "6866aac0-4350-57ba-b3b3-3195229ae517", "url": "https://pretalx.com/bbuzz22/speaker/SQCBZK/"}, {"code": "CBFBFW", "name": "Ilaria Petreti", "avatar": "https://pretalx.com/media/avatars/CBFBFW_tBd7ngd.webp", "biography": "Ilaria is an Information Retrieval/Machine Learning engineer at Sease. Strongly believing in the power of Big Data and Digital Transformation, she got a master in Data Science. \nShe loves the application of data mining and machine learning methods to information retrieval problems. Currently, she is involved in Learning to Rank projects.", "public_name": "Ilaria Petreti", "guid": "4ffc777b-83be-5622-9d8d-5f2e905ab504", "url": "https://pretalx.com/bbuzz22/speaker/CBFBFW/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/UYZAUX/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/UYZAUX/", "attachments": []}, {"guid": "1faa5d3d-d362-52fa-b83e-38157b27ad1c", "code": "D3TKUX", "id": 16197, "logo": "https://pretalx.com/media/bbuzz22/submissions/D3TKUX/61._Nioche_Julien_gSAZd74.png", "date": "2022-06-14T15:20:00+02:00", "start": "15:20", "duration": "00:20", "room": "Kesselhaus", "slug": "bbuzz22-16197-url-frontier-an-open-source-api-and-implementation-for-crawl-frontiers", "url": "https://pretalx.com/bbuzz22/talk/D3TKUX/", "title": "URL Frontier, an open source API and implementation for crawl frontiers", "subtitle": "", "track": "Store", "type": "Short Talk", "language": "en", "abstract": "This talk will present URLFrontier, an API and service implementation of a crawl frontier. After an introduction to how it fits in a distributed crawl architecture, we will go in more details on what the project provides, how it has been used so far and future works.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "UA3XLB", "name": "Julien Nioche", "avatar": "https://pretalx.com/media/avatars/UA3XLB_7eLQsxV.webp", "biography": "Having studied Russian language and culture in Paris and taught French in a school in Kiev, Ukraine, Julien went on to graduate in Text Engineering and Natural Language Processing. He moved to the UK to work as a researcher at the University of Sheffield in 2005 and founded DigitalPebble in 2008.\n\nJulien has been involved in several open source projects, mainly at the Apache Software Foundation, and was the PMC chair for Apache Nutch. He is an Emeritus member of the Apache Software Foundation.\n\nJulien runs workshops on web crawling, speaks at conferences and reviews technical books. He has over 20 years experience in the Java programming language.", "public_name": "Julien Nioche", "guid": "c0cf7916-a646-5343-998d-bf259ba217b0", "url": "https://pretalx.com/bbuzz22/speaker/UA3XLB/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/D3TKUX/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/D3TKUX/", "attachments": []}, {"guid": "655ffb02-aac9-543b-8d27-8faabea2d5e1", "code": "SPLQ9G", "id": 16091, "logo": "https://pretalx.com/media/bbuzz22/submissions/SPLQ9G/60._Schubert_Christoph_KzTnFtB.png", "date": "2022-06-14T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz22-16091-patterns-and-anti-patterns-for-production-ready-kafka-streams-apps", "url": "https://pretalx.com/bbuzz22/talk/SPLQ9G/", "title": "Patterns and anti-patterns for production ready Kafka Streams apps", "subtitle": "", "track": "Stream", "type": "Talk", "language": "en", "abstract": "Kafka Streams is a library for developing streaming application with Apache Kafka.\nWe will discuss best practices for developing a production-ready Kafka Streams application and for running it smoothly in production.\nAfter reviewing the fundamentals of stateless and especially stateful programming with Kafka Streams, we will address the following questions:\n\n- How to prepare your application for seamless failover?\n\n- How to deal with the ever-growing table anti-pattern and properly implement TTL?\n\n- How to prevent resource-leaks when dealing with RocksDB-based state stores?\n\n- Which metrics to monitor?\n\n- How to size your runtime environment?\n\n- What should we keep in mind when deploying Kafka Streams on Kubernetes?\n\n- How to best deal with evolving data models?", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "8VSSGF", "name": "Christoph Schubert", "avatar": "https://pretalx.com/media/avatars/8VSSGF_wPsdX6b.webp", "biography": "Christoph is a Solutions Architect with Confluent. Before joining Confluent, he developed streaming applications for the financial services industry and mobile applications for about everyone else. He holds a PhD in Mathematics and has a long-lasting interest in developing performant and secure software systems.", "public_name": "Christoph Schubert", "guid": "169fc47d-27c4-5758-876a-c865be6ae8c2", "url": "https://pretalx.com/bbuzz22/speaker/8VSSGF/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/SPLQ9G/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/SPLQ9G/", "attachments": []}, {"guid": "d1da0c9e-5fbf-5d97-8183-79dda3cb395a", "code": "QDWHYU", "id": 15963, "logo": "https://pretalx.com/media/bbuzz22/submissions/QDWHYU/58._Lofcali_Hakan_0FhT0Iv.png", "date": "2022-06-14T16:50:00+02:00", "start": "16:50", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz22-15963-cloud-native-etl-with-java-quarkus-kubernetes-and-jib-container-builder", "url": "https://pretalx.com/bbuzz22/talk/QDWHYU/", "title": "Cloud-native ETL with Java Quarkus, Kubernetes, and Jib Container Builder", "subtitle": "", "track": "Stream", "type": "Talk", "language": "en", "abstract": "DataCater unlocks more value from organizations' data, faster. This talk walks you through our stack, architecture, and processes. We develop tools to deploy and run data-driven applications in a cloud-native environment.\n\nWe will give a whirlwind tour on developing a java Quarkus application, a CICD stack powered by Github Actions / ArgoCD, building and deploying containerized Kafka Streams applications at runtime with Jib container builder.\n\nHaving introduced the above common understanding, we will give a high-level overview of how we utilize modern Kubernetes and Cloud tooling to manage multiple clusters in different organizations together with our customers.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "GR3EPA", "name": "Hakan Lofcali", "avatar": "https://pretalx.com/media/avatars/GR3EPA_U1u1ina.webp", "biography": "Hakan is a Software / Data Engineer. He worked and built his knowledge around Software, Data Engineering, and Cloud-Native Computing in different environments. From early start-up to AWS. From sports media companies to highly regulated FSI enterprises. The experiences gained, problems encountered, and solutions found led to him co-founding DataCater to enhance tooling in the Data space.", "public_name": "Hakan Lofcali", "guid": "eb85a2fb-46b6-54d2-bb62-bc30024ba5ca", "url": "https://pretalx.com/bbuzz22/speaker/GR3EPA/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/QDWHYU/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/QDWHYU/", "attachments": []}], "Palais Atelier": [{"guid": "3beb964c-d0d5-50c9-b5d6-64eb59152b7a", "code": "HVP9BD", "id": 15870, "logo": "https://pretalx.com/media/bbuzz22/submissions/HVP9BD/59._Korad_Minakshi_eiYirnX.png", "date": "2022-06-14T10:10:00+02:00", "start": "10:10", "duration": "00:20", "room": "Palais Atelier", "slug": "bbuzz22-15870-min-and-max-aggregations-with-updates-in-real-time", "url": "https://pretalx.com/bbuzz22/talk/HVP9BD/", "title": "Min and Max Aggregations with Updates in Real Time.", "subtitle": "", "track": "Stream", "type": "Short Talk", "language": "en", "abstract": "As part of our analytics platform we handle real time ingestion and aggregations by performing aggregations such as count and sum based on roll ups such as day, hour and minute using Kafka streams app. We recently added support for Min and Max measures along with existing sum and count while performing aggregations on incoming Kafka records. The interesting part is that we also support all these aggregations on Updated records. This talk aims at exploring the interesting details that went into adding the Min and Max functionality to our Kafka streams app while performing real-time aggregations with Updates.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "BEAWSD", "name": "Minakshi Korad", "avatar": "https://pretalx.com/media/avatars/BEAWSD_YD8OKyY.webp", "biography": "I am a Senior Software Engineer working in the Enterprise Insights team at Twilio. I am always excited about distributed systems, data driven technologies also interested in scalability and performance aspects of the product. I am always on the lookout for learning new technologies and optimizing the current usecases.", "public_name": "Minakshi Korad", "guid": "dc2bba58-8f7e-5a38-a289-039e3761854f", "url": "https://pretalx.com/bbuzz22/speaker/BEAWSD/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/HVP9BD/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/HVP9BD/", "attachments": []}, {"guid": "8e61e409-d1c1-5a67-b9dd-08af5eb2da19", "code": "XFVS9W", "id": 16164, "logo": "https://pretalx.com/media/bbuzz22/submissions/XFVS9W/36._Ferragut_Sergio_HoHn22H.png", "date": "2022-06-14T11:30:00+02:00", "start": "11:30", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz22-16164-build-real-time-analytic-applications-the-easy-way", "url": "https://pretalx.com/bbuzz22/talk/XFVS9W/", "title": "Build Real-time Analytic Applications: The Easy Way.", "subtitle": "", "track": "Stream", "type": "Talk", "language": "en", "abstract": "Apache Druid is the open source analytics database that enables development of modern data-intensive applications of any size. It provides sub-second response times on streaming and historical data and can scale to deliver real-time analytics with data ingestion at any data flow rate \u2013 with lightning fast queries at any concurrency. \n\nSounds great, right?  But any large distributed system can be difficult and time-consuming to deploy and monitor. Deployment requirements change significantly from use case to use case, from dev/test clusters on the laptop to hundreds of nodes in the cloud.  Kubernetes has become the de-facto standard for making these complicated systems be much easier to deploy and operate. \n\nIn this talk you will learn about Druid's microservice architecture and the benefits of deploying it on Kubernetes. We will walk you through the open source project's Helm Chart design and how it can be used to deploy and manage clusters of any size with ease.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "BBUPCU", "name": "Sergio Ferragut", "avatar": "https://pretalx.com/media/avatars/BBUPCU_fLsplXl.webp", "biography": "Sergio Ferragut is a database veteran turned Developer Advocate. His experience includes 16 years at Teradata in professional services and engineering roles. He has direct experience in building analytic applications spanning the retail, supply chain, pricing optimization and IoT spaces. Sergio has worked at multiple technology start-ups including APL and Splice Machine where he helped guide product design and field messaging. He joined Imply as a Developer Advocate in 2021 to help and learn from the Apache Druid open source community.", "public_name": "Sergio Ferragut", "guid": "cefbe1ca-4157-5af6-a720-f5005af3954c", "url": "https://pretalx.com/bbuzz22/speaker/BBUPCU/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/XFVS9W/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/XFVS9W/", "attachments": []}, {"guid": "595ffa90-7fc0-5013-9301-f9efdb310622", "code": "R9WGGV", "id": 15979, "logo": "https://pretalx.com/media/bbuzz22/submissions/R9WGGV/39._Kolawole_Steven_n7yOCV1.png", "date": "2022-06-14T12:20:00+02:00", "start": "12:20", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz22-15979-open-science-building-models-like-we-build-open-source-software", "url": "https://pretalx.com/bbuzz22/talk/R9WGGV/", "title": "Open Science: Building Models Like We Build Open-Source Software", "subtitle": "", "track": "Scale", "type": "Talk", "language": "en", "abstract": "Elevator pitch\nThe use of transfer learning has begun a golden era in applications of ML but the development of these models \u201cdemocratically\u201d is still in the dark ages compared to best practices in SWE. I describe how methods of open-source SWE can allow models to be built by a distributed community of researchers.\n---\nOver the past few years, it has become increasingly common to use transfer learning when tackling machine learning problems (e.g. the BERT model on HuggingFace Hub has been downloaded tens of millions of times). However, pre-training often involves training a large model on a large amount of data. This incurs substantial computational (and therefore financial) costs; for example, Lambda estimates that training the GPT-3 language model would cost around $4.6 million. As a result, the most popular pre-trained models are being created by small teams within large, resource-rich corporations. This means that the majority of the research community is excluded from participating in the design and creation of these valuable resources.\n\nHere, I elaborate on why we should develop tools that will allow us to build pre-trained models in the same way that we build open-source software. Specifically, models should be developed by a large community of stakeholders who continually update and improve them. Realizing this goal will require porting many ideas from open-source software development to building and training models, which motivates many threads of interesting research and opens up machine learning research for much larger participation.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "LKGJ7S", "name": "Steven Kolawole", "avatar": "https://pretalx.com/media/avatars/LKGJ7S_Lq9mVNF.webp", "biography": "Steven Kolawole has his technical skillset cuts across Data Science and Software Engineering, with a bias for ML Research these days. His research interests focus on resource-efficient machine learning in terms of computational resources and low-resource/limited labeled data.\n\nHe is and has been heavily involved in varieties of ML subfields including ML Engineering, Software Engineering, Data Engineering, Data Science/Analytics, and Cloud Computing.\n\nSteven is also big on knowledge sharing via community mentorship and collective growth, open-source development, meetups facilitation, speakership, technical writing, research, and he gets kicks from helping tech muggles find their feet.", "public_name": "Steven Kolawole", "guid": "f10d9265-1bc4-55bc-a1a8-6d0d8cb48f7f", "url": "https://pretalx.com/bbuzz22/speaker/LKGJ7S/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/R9WGGV/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/R9WGGV/", "attachments": []}, {"guid": "3b9a1064-3703-5b10-b528-e0cc57374e84", "code": "DNPKX8", "id": 14711, "logo": "https://pretalx.com/media/bbuzz22/submissions/DNPKX8/43._Ferreira_Ricardo_TVqkLh7.png", "date": "2022-06-14T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz22-14711-do-it-yourself-programmable-metrics-using-opentelemetry", "url": "https://pretalx.com/bbuzz22/talk/DNPKX8/", "title": "Do It Yourself: Programmable Metrics using OpenTelemetry", "subtitle": "", "track": "Store", "type": "Talk", "language": "en", "abstract": "Using metrics to measure how good or bad things are going is a proven way to ensure a software-based system is going in the right direction. Most metrics are created and monitored automatically by agent technologies installed in our infrastructure, making us hostages of the set of metrics that these agents are programmed to address. But what if you need to handle your own set of metrics?\n\nThis is a question that often drives developers mad because they fear spending development cycles building something that will end up being locked into a particular monitoring/observability vendor. But OpenTelemetry \u2014 a CNCF observability framework that provides a vendor-neutral approach to tackle metrics, logging, and tracing needs, can change everything.\n\nThis talk will explain how the OpenTelemetry framework allows the creation of custom metrics in a standard, scalable, and reusable way. It will provide an example in Java of a set of metrics that are continuously updated based on the execution of the code and how to hook that data with a compatible observability backend.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "8TM93C", "name": "Ricardo Ferreira", "avatar": "https://pretalx.com/media/avatars/8TM93C_SeekFUi.webp", "biography": "Ricardo is Senior Developer Advocate at AWS, working in the developer relations team for North America. With +20 years of experience, he may have learned a thing or two about distributed systems, fast data analytics, software architecture, databases, and observability. Before joining AWS, he worked for software vendors like Elastic, Confluent, and Oracle. Ricardo is known for his natural ability to explain complex topics. He craftily breaks them down into bite-sized pieces until anyone can understand.\n\nWhile not working, he loves barbecuing in his backyard with his family and friends, where he finally gets the chance to talk about anything unrelated to computers. He currently lives in North Carolina, USA, with his wife and son. Follow Ricardo on Twitter: @riferrei.", "public_name": "Ricardo Ferreira", "guid": "ee12dfd2-9b21-541a-b74f-a8339139515e", "url": "https://pretalx.com/bbuzz22/speaker/8TM93C/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/DNPKX8/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/DNPKX8/", "attachments": []}, {"guid": "2539866f-ef19-5944-a0ed-068896505808", "code": "7VP3E7", "id": 16168, "logo": "https://pretalx.com/media/bbuzz22/submissions/7VP3E7/35._Gupta_Anshum_UH54RZt.png", "date": "2022-06-14T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz22-16168-what-s-new-in-apache-solr-9-0", "url": "https://pretalx.com/bbuzz22/talk/7VP3E7/", "title": "What's new in Apache Solr 9.0", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "Apache Solr 9.0 might be among the most anticipated release for the project in the last decade for Solr. \n\nFor folks who don't follow the project very closely, the list of changes is a lot to comprehend and digest. This talk would make that process easy for the developers by highlighting some key aspects of the 9.0 release. \n\nDuring this talk, I'd cover the migration of the Solr build system to Gradle and what it means for developers who work with Solr. I will also talk about updates to modules like the movement of HDFS into a non-core plugin and the removal of auto-scaling framework, CDCR, and DIH. \n\nIn addition, this talk would also showcase some of the key security, scalability, and stability improvements  that Solr 9.0 brings to the users.\n\nAt the end of this talk, the attendees would have a better understanding of the Solr 9.0 release and a high level road map for the project allowing them to plan better.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "88SDJF", "name": "Anshum Gupta", "avatar": "https://pretalx.com/media/avatars/88SDJF_qK31J86.webp", "biography": "Anshum is an Apache Lucene and Solr committer and Project Management Committee member. He started dabbling with Lucene about 15 years ago, and since then has worked at various organizations building both internal and consumer facing search platforms on top of Lucene and Solr. He is currently a part of Apple's Open Source Technologies group.", "public_name": "Anshum Gupta", "guid": "6cee6191-f825-5d91-b036-242514136cb0", "url": "https://pretalx.com/bbuzz22/speaker/88SDJF/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/7VP3E7/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/7VP3E7/", "attachments": []}, {"guid": "5ecb6bad-6248-5391-8968-ef766d0061cf", "code": "HZQPUF", "id": 16077, "logo": "https://pretalx.com/media/bbuzz22/submissions/HZQPUF/50._Liu_Frank_MOSaLXs.png", "date": "2022-06-14T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz22-16077-building-an-open-source-framework-for-generating-embedding-vectors", "url": "https://pretalx.com/bbuzz22/talk/HZQPUF/", "title": "Building an Open-source Framework for Generating Embedding Vectors", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "The combination of big data and deep learning has fundamentally changed the way we approach search systems, allowing us to index audio, images, video, and other human-generated data based on an embedding vector instead of an auxiliary description. These advancements are backed by new and often times increasingly complex machine learning (ML) models, leading to an even wider research-to-industry gap despite the introduction of MLOps platforms and a variety of model hubs. We summarize some of the challenges facing practical machine learning in 2022 and beyond as follows: 1) many ML applications require a combination of multiple models, leading to a lot of overly complex and difficult-to-maintain auxiliary code, 2) many engineers are unfamiliar with ML and/or data science, making it difficult for them to train, test, and integrate ML models into existing infrastructure, and 3) constant architectural updates to SOTA deep learning models creates significant overhead when deploying said models in production environments.\n\nIn this talk, we discuss lessons learned from building an open-source (https://github.com/towhee-io/towhee) and scalable framework for generating embedding vectors purpose-built to tackle the above challenges. Early on, we communicated with dozens of industry partners to understand their application(s) and architected our platform around their requirements. This open source project is currently being used by 3 major corporations ($10B+ market value) and a number of small- and mid-size startups in proof-of-concept and production systems.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "HLP7KS", "name": "Frank Liu", "avatar": "https://pretalx.com/media/avatars/HLP7KS_ifMdQfu.webp", "biography": "Frank Liu is an engineering manager and tech lead at Zilliz, where he leads a team of multi-national engineers while serving as a maintainer for the Towhee open-source project (https://towhee.io). In a span of less than three months, he and his team turned Towhee into an easy-to-use platform in use by 3 major corporations ($10B+ market value) and a number of small- and mid-size startups in proof-of-concept and production systems.\n\nPrior to Zilliz, Frank co-founded a indoor localization startup based in Shanghai and worked as a deep learning engineer at Yahoo in San Francisco. Frank holds a MS and BS degrees in Electrical Engineering from Stanford University.", "public_name": "Frank Liu", "guid": "9133ae7a-d7a8-56f7-b458-7a93c5e18262", "url": "https://pretalx.com/bbuzz22/speaker/HLP7KS/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/HZQPUF/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/HZQPUF/", "attachments": []}, {"guid": "1a83801d-87e0-5a91-9b3f-da266e339d14", "code": "T7SBSJ", "id": 16190, "logo": "https://pretalx.com/media/bbuzz22/submissions/T7SBSJ/47._Soman_Chinmay_EBda6zD.png", "date": "2022-06-14T16:50:00+02:00", "start": "16:50", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz22-16190-next-generation-olap-stack-using-apache-pinot", "url": "https://pretalx.com/bbuzz22/talk/T7SBSJ/", "title": "Next generation OLAP stack using Apache Pinot", "subtitle": "", "track": "Store", "type": "Talk", "language": "en", "abstract": "Real-time analytics has transformed the way companies do business. It has unlocked the ability to make real-time decisions such as customer incentives, business metrics, fraud detection and provide a personalized user experience that accelerates growth and user retention. This is a complex problem and naturally, there are several OLAP (OnLine Analytics Processing) solutions out there, each focusing on a different aspect.\n\nIn order to support all such use cases, we need an ideal OLAP platform that has the ability to support extremely high query throughput with low latency and at the same time provide high query accuracy \u2013 in the presence of data duplication and real-time updates. In addition, the same system must be able to ingest data from all kinds of data sources, handle unstructured data and real-time upserts. While there are different ways of solving each such problem scenario, ideally we want one unified platform that can be easily customized. In this talk, we will go over the rich capabilities of Apache Pinot that make it an ideal OLAP platform.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "RD8QUH", "name": "Chinmay Soman", "avatar": "https://pretalx.com/media/avatars/RD8QUH_GTQVvF3.webp", "biography": "Chinmay Soman is a founding engineer in StarTree, building real time analytics solutions at scale. Previously he led the streaming platform team at Uber for building a large scale, self-serve platform around messaging, stream processing and OLAP technologies. Before that, he worked at LinkedIn and IBM, focussing on distributed systems and security. He\u2019s a PMC member of Apache Samza and a committer on Apache Pinot, Voldemort, uReplicator and AthenaX.", "public_name": "Chinmay Soman", "guid": "39cb24fe-1a5e-5397-9edb-b742b5e0b6f9", "url": "https://pretalx.com/bbuzz22/speaker/RD8QUH/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/T7SBSJ/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/T7SBSJ/", "attachments": []}], "Maschinenhaus": [{"guid": "f28a67a3-aac1-570f-a598-c6fd8ac36acf", "code": "CY7LTV", "id": 19824, "logo": "https://pretalx.com/media/bbuzz22/submissions/CY7LTV/65._Henkle_Charlotte__-_Neumann_Sean_m9k2o0b.png", "date": "2022-06-14T10:10:00+02:00", "start": "10:10", "duration": "00:20", "room": "Maschinenhaus", "slug": "bbuzz22-19824-working-in-the-open-search", "url": "https://pretalx.com/bbuzz22/talk/CY7LTV/", "title": "Working in the Open...Search", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "In July of 2021, AWS launched the OpenSearch Project, an Apache 2.0 licensed fork derived from Elasticsearch 7.10.2 & Kibana 7.10.2. The OpenSearch Project is a community-driven, open source search and analytics suite. It consists of a search engine daemon, OpenSearch, and a visualization and user interface, OpenSearch Dashboards. OpenSearch enables people to ingest, secure, search, aggregate, view, and analyze data. Our goal is to build great software together with a strong and vibrant community. In this talk we\u2019ll cover what we\u2019ve launched so far, what\u2019s coming in the future, and the challenges of stewarding an open-source project while also being associated with a large corporation.\n\nThis talk is sponsored by OpenSearch", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "WE7EJP", "name": "Charlotte Henkle", "avatar": "https://pretalx.com/media/avatars/WE7EJP_1uC6NPh.webp", "biography": "Charlotte Henkle is a Senior Software Development Manager at the OpenSearch Project supporting the OpenSearch engine, Engineering Efficiency and Clients teams.  She\u2019s worked for Amazon for 14 years, working in Amazon Retail Catalog, Amazon Photos and Amazon Explore.  Prior to Amazon she wrote software for federal clients like the Marine Corps and the FBI and worked for the University of Chicago Biological Learning Center.", "public_name": "Charlotte Henkle", "guid": "adc8a31c-dc88-562c-bd25-eee2322a8d5b", "url": "https://pretalx.com/bbuzz22/speaker/WE7EJP/"}, {"code": "8VARWA", "name": "Sean Neumann", "avatar": "https://pretalx.com/media/avatars/8VARWA_K3apbAt.webp", "biography": "Sean Neumann is a Seattle-based software engineering manager leading development of OpenSearch Dashboards, a community-driven, visualization and analytics application in the OpenSearch Project. Over the last 9 years at Amazon, Sean has worked on Amazon OpenSearch Service, FinTech business applications, and the eCommerce Platform. Prior to Amazon, Sean was a software developer at Microsoft and various Seattle startups. Outside of work, Sean regularly practices yoga, where he has also been teaching for over six years.", "public_name": "Sean Neumann", "guid": "a8532395-adbe-58ab-9008-b4786b355a6b", "url": "https://pretalx.com/bbuzz22/speaker/8VARWA/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/CY7LTV/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/CY7LTV/", "attachments": []}, {"guid": "6668676b-4e03-5774-bb5a-0870f7f1ef09", "code": "S9EVEB", "id": 16128, "logo": "https://pretalx.com/media/bbuzz22/submissions/S9EVEB/55._Davies_Charlie_qCU7nf0.png", "date": "2022-06-14T10:40:00+02:00", "start": "10:40", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz22-16128-should-we-stop-using-distance-in-our-location-based-data-recommendation-models", "url": "https://pretalx.com/bbuzz22/talk/S9EVEB/", "title": "Should we stop using distance in our location-based data recommendation models?", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "Location is an important decision-making factor for many end users.  Hotel aggregators, job search portals, property listing companies all filter out results that are too far away. If the results page shows locations that are hard to reach, conversion rates will plummet. \n\nIf you\u2019re quality scoring results based on straight-line distance, you\u2019re not personalising your results page as well as you could be. That\u2019s because we never truly travel in a straight line, instead we\u2019re at the mercy of the transport networks around us. Distance never considers the context of accessibility, which is unique to every location around the world. \n\nUsing distance is impacting search result ranking because: \n1.\tIt doesn\u2019t acknowledge that long distances in quiet rural areas are easier to travel vs. congested urban areas \n2.\tIt ignores that some locations are situated on fast transport routes \u2013 they could appear far away but they may be really easy to access depending on the local infrastructure  \n3.\tLocal geography can massively impact accessibility \u2013 mountains, rivers and beaches all provide accessibility challenges  \n\nThe solution: \nUsing real world examples I\u2019ll discuss how to integrate travel times into your recommendation model and what the effects are for businesses and end users. I\u2019ll also discuss how the presence of transport data on search result listings helps reduce cognitive load when users are making a decision.  \n\nI\u2019ll end with a quick demo showing how to build it into your recommendation engine.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "TKLGFU", "name": "Charlie Davies", "avatar": "https://pretalx.com/media/avatars/TKLGFU_kjWUke5.webp", "biography": "Charlie Davies is the CEO of TravelTime and the creator of an API and set of plugins which enable users to search location data using minutes rather than miles. Charlie is responsible for running all aspects of TravelTime including planning the product roadmap and the coordination of the technical team.", "public_name": "Charlie Davies", "guid": "9fc826e8-874c-51fa-8771-a2c9e5351f8e", "url": "https://pretalx.com/bbuzz22/speaker/TKLGFU/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/S9EVEB/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/S9EVEB/", "attachments": []}, {"guid": "215eb312-a2b0-591c-a2da-1c4c3131f437", "code": "988SHD", "id": 15209, "logo": "https://pretalx.com/media/bbuzz22/submissions/988SHD/37._Tisiot_Francesco_DvtY1es.png", "date": "2022-06-14T11:30:00+02:00", "start": "11:30", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz22-15209-solving-the-knapsack-problem-with-recursive-queries-and-postgresql", "url": "https://pretalx.com/bbuzz22/talk/988SHD/", "title": "Solving the knapsack problem with recursive queries and PostgreSQL", "subtitle": "", "track": "Store", "type": "Talk", "language": "en", "abstract": "Optimization problems are everywhere, from deciding which clothes to pack in our luggage (aka the knapsack problem), to selecting the tasks that will be worked during a sprint. Trying to solve these type of problems by hand is a tedious task often resulting in sub-optimal decisions.\n\nIn this talk, we'll understand how PostgreSQL recursive queries can help. Starting from the proper problem definition, we'll then explore how to build queries that call themselves recursively, what are the risks associated with this approach and safeguards we can set to optimise performances. Finally we'll demonstrate how two new features released in PostgreSQL 14 enable an easier handling of the recursive statements.\n\nIf you're into PostgreSQL and eager to understand how recursion works, this session is for you!", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "QTSPFX", "name": "Francesco Tisiot", "avatar": "https://pretalx.com/media/avatars/QTSPFX_nLsFnN4.webp", "biography": "Francesco comes from Verona, Italy and works as a Developer Advocate at Aiven. With his many years of experience as a data engineer, he has stories to tell and advice for data-wranglers everywhere. Francesco loves sharing knowledge with others as a speaker and writer, and is on a mission to defend the world from bad Italian food!", "public_name": "Francesco Tisiot", "guid": "817da5c4-2bb3-56be-8e9b-a51687d77abc", "url": "https://pretalx.com/bbuzz22/speaker/QTSPFX/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/988SHD/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/988SHD/", "attachments": []}, {"guid": "146462dc-f8b9-5ec2-a6e3-2d74b6926778", "code": "CX997Y", "id": 15639, "logo": "https://pretalx.com/media/bbuzz22/submissions/CX997Y/40._Garcia_Sanchez_Ana_Maria_xOm93CS.png", "date": "2022-06-14T12:20:00+02:00", "start": "12:20", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz22-15639-relevance-is-not-a-thing-but-a-perception", "url": "https://pretalx.com/bbuzz22/talk/CX997Y/", "title": "Relevance is not a Thing but a Perception", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "When talking about relevance regarding search, it often sounds like it is a thing, something that can be touched and seen. Nevertheless, that is not the case. What do I mean by that? In this talk, I will provide some examples of how relevance is often merely seen as a score when it can be, in fact, an engaging relationship where the user and the search UI connect in aesthetic and enjoyable ways. I will present numerous examples of innovative search experiences that challenge prevailing schemas and structures and lead instead to elements of motion and correlated visual action that allows us to perceive the beauty of relevancy on a different level. Because relevance is a matter of perception\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "99MZSB", "name": "Ana Maria Garc\u00eda S\u00e1nchez", "avatar": "https://pretalx.com/media/avatars/99MZSB_Pk3PNzd.webp", "biography": "Hi! My name's Ana and I'm a 22 years old girl finishing her degree in Software Engineering while working at Empathy.co as a Backend Engineer. \n\nDuring my time in college, I've been active in student representation. In fact, I've been the president of the delegation and assembly of my school for two years, leading groups, making decisions, giving speeches, organizing events like the EII Tech Fest... \n\nAt this time, I'm trying to leave that world, but while I do, I'm the coordinator of the infrastructure commission at RITSI.  \n\nNot only I'm not afraid of talking in public and being \"against\" an audience, but I enjoy doing it, so trying to give a speech at Berlin Buzzwords is an exciting new experience.", "public_name": "Ana Maria Garc\u00eda S\u00e1nchez", "guid": "cc9e7dc0-2dea-5025-b915-12d9022882e3", "url": "https://pretalx.com/bbuzz22/speaker/99MZSB/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/CX997Y/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/CX997Y/", "attachments": []}, {"guid": "a5619d9b-46ba-56bc-b6cd-8c39c7574fd8", "code": "ZE3AJQ", "id": 15869, "logo": "https://pretalx.com/media/bbuzz22/submissions/ZE3AJQ/44._Burch_Nick_yS3BCF0.png", "date": "2022-06-14T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz22-15869-learning-about-ai-ml-for-text-with-wordle", "url": "https://pretalx.com/bbuzz22/talk/ZE3AJQ/", "title": "Learning about AI/ML for Text, with Wordle!", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "What can the hit game Wordle teach us about Information Retrieval, Search and AI/ML? As it turns out, quite a bit!\n\nWe'll use the Wordle game as our example \"text problem\" we want to solve, and run through many of the key concepts you need to get started  with AI and ML for text. We'll see (with code!) how some common text-related statistics work, and how they can be used to solve (cheat...) Wordle. Then, we'll build ourselves an AI to do the same. Finally, we'll see how that compares to brute-forcing it with regular expressions!\n\nWe won't solve all your text-related problems, but hopefully you'll learn the key concepts you need for more advanced talks. And if nothing else, you'll understand the python code for an AI to help you win Wordle!\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "97HYST", "name": "Nick Burch", "avatar": "https://pretalx.com/media/avatars/97HYST_759PqjE.webp", "biography": "Nick is heavily involved in a number of Apache projects, such as Tika and POI, while having the fortune to know many of the people involved in the Apache Big Data and Search space! When not helping out with Apache things, Nick works as the Director of Engineering at FLEC, where he leads a team making heavy use of Open Source technologies. When not helping improve the logistics industry, he is often to be found attending or organising BarCamps, Geek Nights, or other such fun events dedicated to sharing what's great and new!", "public_name": "Nick Burch", "guid": "e7f4faed-180f-58b5-8048-70be86cb80f6", "url": "https://pretalx.com/bbuzz22/speaker/97HYST/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/ZE3AJQ/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/ZE3AJQ/", "attachments": []}, {"guid": "c168edd4-2d58-5d76-9a51-9f792adbd68d", "code": "E9AJJE", "id": 16065, "logo": "https://pretalx.com/media/bbuzz22/submissions/E9AJJE/62._Shukla_Sakshi_kUFgcYL.png", "date": "2022-06-14T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz22-16065-unpaired-sentiment-to-sentiment-translation-a-cycled-reinforcement-learning-approach", "url": "https://pretalx.com/bbuzz22/talk/E9AJJE/", "title": "Unpaired Sentiment-to-Sentiment Translation: A Cycled Reinforcement Learning Approach", "subtitle": "", "track": "Stream", "type": "Talk", "language": "en", "abstract": "Sentiment-to-Sentiment translation is a special case for Style Transfer. Style Transfer is emphasised on generating the opposite polar style in terms of emotions or sentiment. This results in the transfer of style successfully but loses the semantic context of the sentence. This is caused due to inefficient amount of data having these relevant paired sentences with polar styles. This talk focuses on generating unpaired dataset which preserves the semantic context during a style change using cycled Reinforcement Learning approach on parallel data having emotionalization and neutralization modules. \n\nThe talk can be viewed from https://bit.ly/bbuzz2022", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "DULSC8", "name": "Sakshi Deo Shukla", "avatar": "https://pretalx.com/media/avatars/DULSC8_MlG6hR0.webp", "biography": "I am currently pursuing my master in Computational Linguistics at the University of Stuttgart.  I am working as a Research Assistant at Landes Baden W\u00fcttemberg, ISTE. I have formerly worked as a Senior data scientist at Delhivery, India. I have been leading various technical communities in Delhi like Women Techmakers, WiMLDS, GDG Cloud New Delhi.", "public_name": "Sakshi Deo Shukla", "guid": "5555700a-ad05-5558-8600-c232a581b57d", "url": "https://pretalx.com/bbuzz22/speaker/DULSC8/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/E9AJJE/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/E9AJJE/", "attachments": [{"title": "https://bit.ly/bbuzz2022", "url": "/media/bbuzz22/submissions/E9AJJE/resources/Unpaired_Sentiment-to-Sentiment_Translation_A_Cycle_EZHCXfX.pdf", "type": "related"}]}, {"guid": "47b3a3b0-cd70-526d-b171-5d7b84a1c5e1", "code": "JNJTHF", "id": 16167, "logo": "https://pretalx.com/media/bbuzz22/submissions/JNJTHF/51._Erlandson_Erik_GSYxnlc.png", "date": "2022-06-14T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz22-16167-scaling-the-open-source-climate-community", "url": "https://pretalx.com/bbuzz22/talk/JNJTHF/", "title": "Scaling the Open Source Climate Community", "subtitle": "", "track": "Scale", "type": "Talk", "language": "en", "abstract": "The scarcity of standardized and accessible data at the convergence of human climate impacts and the financial sector prevents economic stakeholders from effectively aligning world-wide investment and capital flows with Environmental, Social, and Governance (ESG) objectives. The majority of financial companies cannot afford costly bespoke ingestion and curation projects, and so climate-aware investing remains limited without the benefit of shared data or open protocols.\n\nAt the Open Source Climate (OS-Climate) community, we are building an open data science platform that supports data ingestion, processing and quality management for data from both corporate climate reports and investment related data. In order for this global community project to succeed, OS-Climate must implement traditional scalability of compute and data, but that alone is insufficient. The community must also scale the operation of its cluster and software deployments. Furthermore, it must effectively scale its ability to onboard new data workflows from actively contributing members. Last but not least, it must be able to scale its own governance at each of these levels, as they mature.\n\nIn this talk, Erik will introduce OS-Climate and tell the story of how this open community has managed its own evolution to continue scaling data, computation, operations, member contributions and governance. The audience will learn about tools from software, data science, platforms, and community architecture that can help their own communities grow.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "ED8CCY", "name": "Erik Erlandson", "avatar": "https://pretalx.com/media/avatars/ED8CCY_EXy4XxQ.webp", "biography": "Erik Erlandson is a Software Engineer at Red Hat\u2019s Open Services Group, where he explores emerging technologies at the intersection of Data Science workloads and the Kubernetes ecosystem.", "public_name": "Erik Erlandson", "guid": "56f20031-74c0-5578-9264-8acd7c75b5ce", "url": "https://pretalx.com/bbuzz22/speaker/ED8CCY/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/JNJTHF/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/JNJTHF/", "attachments": []}, {"guid": "52c9740c-df18-5f71-b7de-75b217f3b193", "code": "ZUNELS", "id": 15433, "logo": "https://pretalx.com/media/bbuzz22/submissions/ZUNELS/54._Dangat_Umesh_h2zBvw4.png", "date": "2022-06-14T16:50:00+02:00", "start": "16:50", "duration": "00:20", "room": "Maschinenhaus", "slug": "bbuzz22-15433-nrtsearch-yelp-s-fast-scalable-and-cost-effective-open-source-search-engine", "url": "https://pretalx.com/bbuzz22/talk/ZUNELS/", "title": "NrtSearch: Yelp\u2019s fast, scalable, and cost-effective open source search engine", "subtitle": "", "track": "Search", "type": "Short Talk", "language": "en", "abstract": "Search and ranking are part of many important features on the Yelp platform - from looking for a plumber to showing relevant photos of the dish you search for. These varied use cases led to the creation of Yelp\u2019s Elasticsearch-based ranking platform which we presented at Berlin Buzzwords 2019, allowing real-time indexing, learning-to-rank, and lesser maintenance overhead, as well as enabling access to search functionality to more teams at Yelp. We recently built Nrtsearch, a Lucene-based search engine, to replace Elasticsearch. We have open sourced this search engine under the Apache 2.0 license.\n\nThis talk will detail\n\nChallenges associated with scaling Elasticsearch costs and performance. \nMainly issues related to the document-based replication approach.\nDifficulties with real time auto scaling of Elasticsearch.\nInefficient usage of resources due to hot and cold node issues.\n\nArchitecture of Nrtsearch\nUses Lucene\u2019s near-real-time (NRT) segment replication\nPrimary-Replica architecture: Primary does all writing including segment merges while replicas simply copy over segments using Lucene's NRT APIs and serve search queries. \nCluster orchestration, availability and management of nodes is left to systems like Kubernetes that excel at resource management and scheduling.\nTruly stateless architecture: Deployed as a standard microservice using Kubernetes. State is committed to s3, upon a restart of a primary or replica, the most recent state from s3 is pulled down.\n\nBenefits of this architecture\nPerformance increased by up to 50%\nCluster costs lowered by up to 50%\nUse of standard tools (k8s) to manage operational aspects of the cluster, relieving ranking infrastructure teams to focus on search-related problems.\n\nChallenges involved in rolling this out to production\nLucene\u2019s segment replication approach and  the code itself is not widely used in the industry so had some rough edges. Exciting performance bugs!  \n\nFuture work\nEnhance feature support via extensible plugins like vector-embeddings\nContinue to simplify and open source deployment tooling to help others deploy NrtSearch in their own cloud environments.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "ZXGUDV", "name": "Umesh Dangat", "avatar": "https://pretalx.com/media/avatars/ZXGUDV_n79gE9m.webp", "biography": "Umesh Dangat is a Principal Engineer and Group Tech Lead for the market engineering platform at Yelp. Umesh joined Yelp in 2015 and has since architected and led Yelp\u2019s ranking infrastructure evolution into its third generation. This group at Yelp is responsible for providing search and ranking infrastructure to all of Yelp\u2019s search and ranking needs in a cost efficient, scalable and extensible way.\n\nPrior to Yelp, Umesh has worked at various companies for over a decade mostly solving search, streaming and data ingestion problems for large datasets and building backend systems.\n\nUmesh is also an open source contributor for popular search projects like Elasticsearch, learning to rank and most recently Nrtsearch.", "public_name": "Umesh Dangat", "guid": "0bc1a4ec-8a13-58ca-b238-be138323f40d", "url": "https://pretalx.com/bbuzz22/speaker/ZXGUDV/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/ZUNELS/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/ZUNELS/", "attachments": []}], "Frannz Salon": [{"guid": "972eb9d8-ce31-511f-aa3d-11330d911e9e", "code": "GAGCJ3", "id": 15729, "logo": "https://pretalx.com/media/bbuzz22/submissions/GAGCJ3/56._Dagdelen_John_dYFExUZ.png", "date": "2022-06-14T10:10:00+02:00", "start": "10:10", "duration": "00:20", "room": "Frannz Salon", "slug": "bbuzz22-15729-matscholar-the-search-engine-for-materials-science-researchers", "url": "https://pretalx.com/bbuzz22/talk/GAGCJ3/", "title": "Matscholar: The search engine for materials science researchers", "subtitle": "", "track": "Search", "type": "Short Talk", "language": "en", "abstract": "Matscholar (Matscholar.com) is a scientific knowledge search engine for materials science researchers. We have indexed information about materials, their properties, and the applications they are used in for millions of materials by text mining the abstracts of more than 5 million materials science research papers. Using a combination of traditional and AI-based search technologies, our system extracts the key pieces of information and makes it possible for researchers to do queries that were previously impossible.  Matscholar, which utilizes Vespa.ai and our our own bespoke language models, greatly accelerates the speed at which energy and climate tech researchers can make breakthroughs and can even help them discover insights about materials and their properties that have gone unnoticed.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "RZBXSL", "name": "John Dagdelen", "avatar": "https://pretalx.com/media/avatars/RZBXSL_MMDsjRa.webp", "biography": "John Dagdelen is a PhD student in the department of materials science and engineering at UC Berkeley. His research focuses at the intersection of artificial intelligence, high performance computing, and materials discovery and design.", "public_name": "John Dagdelen", "guid": "3db29e2f-364c-5c1e-b561-9af9f2726658", "url": "https://pretalx.com/bbuzz22/speaker/RZBXSL/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/GAGCJ3/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/GAGCJ3/", "attachments": []}, {"guid": "e60d9f1f-cfbd-531c-af4b-013b2bba6d9d", "code": "GPQE8T", "id": 16156, "logo": "https://pretalx.com/media/bbuzz22/submissions/GPQE8T/34._Reed_Josh_PZeOiaa.png", "date": "2022-06-14T10:40:00+02:00", "start": "10:40", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz22-16156-effective-ci-cd-for-large-systems", "url": "https://pretalx.com/bbuzz22/talk/GPQE8T/", "title": "Effective CI/CD for Large Systems", "subtitle": "", "track": "Scale", "type": "Talk", "language": "en", "abstract": "CI/CD brings tremendous value to development teams. The rapid availability of feedback helps developers make informed decisions about their design choices and lets teams deploy with confidence. But when systems become large and test times go from seconds to hours, how do we get our groove back? In this talk, we\u2019ll explore strategies for validating large, complex systems, such as:\n\n- Setting well-defined component boundaries\n- Flexibly modeling dependencies between these components\n- Ranking tests by cost versus value\n- Testing in production with canary launches and feature flags\n\nThese and similar techniques let us minimize test times, maximize confidence, and free our teams up to focus on delivering value to customers.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "QQCNNH", "name": "Josh Reed", "avatar": "https://pretalx.com/media/avatars/QQCNNH_OpT32WI.webp", "biography": "Josh lives in Montr\u00e9al, Canada, and works on the Release Engineering team at [Aiven](https://aiven.io/). With many years of experience in several parts of the software development lifecycle, he has a passion for processes that enable smooth interactions between developers and operations. When he\u2019s not hacking away at a problem or championing best practices, he loves to cook recipes from around the world, as long as he can garnish them with a little bit of cilantro!", "public_name": "Josh Reed", "guid": "c2d80b1d-6514-5514-a8ad-64da177f48f7", "url": "https://pretalx.com/bbuzz22/speaker/QQCNNH/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/GPQE8T/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/GPQE8T/", "attachments": []}, {"guid": "4ee69693-702f-587a-bf0a-9d0562f50191", "code": "3HHYQM", "id": 16058, "logo": "https://pretalx.com/media/bbuzz22/submissions/3HHYQM/38._Bibi_Eran_JYPmNhu.png", "date": "2022-06-14T11:30:00+02:00", "start": "11:30", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz22-16058-don-t-panic-getting-your-infrastructure-drift-under-control", "url": "https://pretalx.com/bbuzz22/talk/3HHYQM/", "title": "Don't Panic: Getting Your Infrastructure Drift Under Control", "subtitle": "", "track": "Scale", "type": "Talk", "language": "en", "abstract": "In your ever-changing Infrastructure, some changes are intentional while others are not. \n\nInfrastructure Drift can happen for many reasons, sometimes it happens when adding or removing resources, other times when changing resource definitions upon resource termination or failure, and even when changes have been made manually or via other automation tools.\n\nWhen something is changed intentionally, it will appear in the source code, and should not raise any alarm. However, if any part of the infrastructure has been changed manually, there are tools that can identify this, and alert to the change. In other words, if your IaC drifted from its expected state, then you can in fact, detect it.\n\nApplying simple solutions can empower  DevOps and developer velocity, with the reassurance and context for unexpected changes in your IaC, in near real-time. This talk will showcase real-world examples, and practical ways to apply this in your production environments, while doing so safely and at the pace of your engineering cycles.\n\nDrift is what happens whenever the real-world state of your infrastructure differs from the state defined in your configuration.", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "EUZ9MS", "name": "Eran Bibi", "avatar": "https://pretalx.com/media/avatars/EUZ9MS_pOGGHll.webp", "biography": "Eran Bibi is Co-Founder & Chief Product Officer at Firefly. With years of experience in anything DevOps/SRE and security, he has earned a reputation as a CI/CD and SRE expert and an avid admin of Cloud Platforms and containerized environments.\n\nPrior to Firefly, Eran was Head of DevOps & Cloud Platform at Aqua Security and DevOps Group Lead at Finastra. Eran is a frequent speaker at Cloud Native meetups, AWS community meetups, and other cloud workshops and conferences.", "public_name": "Eran Bibi", "guid": "e29009e6-dd45-5da1-9e29-9dfb5b5aaf2e", "url": "https://pretalx.com/bbuzz22/speaker/EUZ9MS/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/3HHYQM/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/3HHYQM/", "attachments": []}, {"guid": "7055c783-8928-5d48-821e-cbd7eb8edb25", "code": "X8CYRY", "id": 16144, "logo": "https://pretalx.com/media/bbuzz22/submissions/X8CYRY/41._Fricke_Thomas_fZZLCNO.png", "date": "2022-06-14T12:20:00+02:00", "start": "12:20", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz22-16144-optimizing-containers-for-security-and-scaling", "url": "https://pretalx.com/bbuzz22/talk/X8CYRY/", "title": "Optimizing Containers for Security and Scaling", "subtitle": "", "track": "Scale", "type": "Talk", "language": "en", "abstract": "This talk is about creating minimal containers. The author has started to dive into Kubernetes and Container Security some years ago. Minimizing the size and the attack vectors are just two sides of the same coin. As a reward, you get much faster deployment pipelines, enabling more automated testing and higher scalability. A speed up by a factor of 10 or 20 is not unusual, sometimes the size of a cointainer shrinks by a factor of 100.\n\n- 12factor IX: disposability\n- bad examples\n- optimizing the size of a container\n - building minimal containers from scratch\n - a small step in a Dockerfile, a big leap for container size \n- debugging minimal containers\n- speed up\n- security measured by Trivy", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "M3HVSR", "name": "Thomas Fricke", "avatar": "https://pretalx.com/media/avatars/M3HVSR_3ltTmd6.webp", "biography": "Kubernetes Security in Critical Infrastructure\nFounder of Resility, Endocode, In\u00f6g \n\nOpen Source Enthusiast", "public_name": "Thomas Fricke", "guid": "c304b0a9-9bbf-58cc-a48b-d99a425885d9", "url": "https://pretalx.com/bbuzz22/speaker/M3HVSR/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/X8CYRY/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/X8CYRY/", "attachments": []}, {"guid": "f225d632-6098-5702-ae2d-e684193e86c3", "code": "QJZPYZ", "id": 14713, "logo": "https://pretalx.com/media/bbuzz22/submissions/QJZPYZ/Gheorge_Radu_-_Hacman_Ciprian_8rSiGtW.png", "date": "2022-06-14T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz22-14713-autoscaling-elasticsearch-for-logs-on-kubernetes", "url": "https://pretalx.com/bbuzz22/talk/QJZPYZ/", "title": "Autoscaling Elasticsearch for Logs on Kubernetes", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "Elasticsearch (or OpenSearch) clusters likely need to scale to adapt to changes in load. But autoscaling Elasticsearch isn't trivial: indices and shards need to be well sized and well balanced across nodes. Otherwise the cluster will have hotspots and scaling it further will be less and less efficient.\n\nThis talk focuses on two aspects:\n- best practices around scaling Elasticsearch for logs and other time-series data\n- how to apply them when deploying Elasticsearch on Kubernetes. In the process, a new (open-source) operator will be introduced (yes, there will be a demo!). This operator will autoscale Elasticsearch while keeping a good balance of load. It does so by changing the number of shards in the index template and rotating indices when the number of nodes changes.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "3CMEKA", "name": "Radu Gheorghe", "avatar": "https://pretalx.com/media/avatars/3CMEKA_DwlKQxc.webp", "biography": "Radu Gheorghe works mainly as a [search consultant](https://sematext.com/consulting) at Sematext, working with clients of all sizes on their Elasticsearch and Solr projects. He is also a [trainer](https://sematext.com/training/) and does [production support](https://sematext.com/support/) for both these search engines.\n\nSometimes he helps out with the development of Sematext Cloud (an observability SaaS), mostly when it comes to Elasticsearch and log shippers (e.g. Logstash, rsyslog...). He also writes on the [Sematext blog](https://sematext.com/blog/author/radu7gheorghe/page/2/) or helps other publish new articles.\n\nHe co-authored a book (Elasticsearch in Action, Manning), recorded a video tutorial (Working with Elasticsearch, O'Reilly) and was a speaker at a [number of conferences](https://www.youtube.com/watch?v=kKocQdYGVJM&list=PLjwv6_Ik6hnLEmz-rcII0cGyAIToRF4Q6), such as Berlin Buzzwords, LuceneSolrRevolution (later Activate) and O'Reilly Velocity.", "public_name": "Radu Gheorghe", "guid": "50415125-46a1-53ec-bc30-5230c9029fba", "url": "https://pretalx.com/bbuzz22/speaker/3CMEKA/"}, {"code": "SWMRJ8", "name": "Ciprian Hacman", "avatar": "https://pretalx.com/media/avatars/SWMRJ8_uS8qwW4.webp", "biography": "Ciprian Hacman works mainly as a DevOps/Software Engineer for polypoly, helping them and other clients modernize their infrastructure and migrate to Kubernetes.\nHe is also an open source project maintainer for kOps (Kubernetes Operations), etcd-manager, cloud-provider-aws and frequent contributor to other projects in the Kubernetes ecosystem.", "public_name": "Ciprian Hacman", "guid": "74424837-2a41-5687-8424-6bb6a215f3f8", "url": "https://pretalx.com/bbuzz22/speaker/SWMRJ8/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/QJZPYZ/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/QJZPYZ/", "attachments": []}, {"guid": "c4e4d9ff-0433-59c2-a0d1-9032dbf5be77", "code": "37DVBP", "id": 16162, "logo": "https://pretalx.com/media/bbuzz22/submissions/37DVBP/49._Davies_Lily_-_Perifanos_Konstantinos_q1p2LRt.png", "date": "2022-06-14T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz22-16162-dense-concept-retrieval", "url": "https://pretalx.com/bbuzz22/talk/37DVBP/", "title": "Dense Concept Retrieval", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "At codec.ai we are processing at a daily basis a large volume of input streams in different modalities: text, image, videos. Understanding and making sense of what this content is from a cultural point of view is a challenging task. Here, we will be presenting our multimodal search engine which makes possible to search text, image and video content. \n\n\nWe will be discussing traditional information retrieval approaches augmented with dense retrieval representations produced by neural networks (embeddings), dot product queries with Elasticsearch and approximate nearest neighbor techniques such as Locality-Sensitive Hashing (LSH) an Product Quantization (PQ).\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "Y9WUTK", "name": "Konstantinos Perifanos", "avatar": "https://pretalx.com/media/avatars/Y9WUTK_b101WxO.webp", "biography": "Kostas is the Head of Data Science at codec.ai, leading the strategy and the implementation of machine learning, natural language processing\nand information retrieval across the business. He enjoys high quality coffee, hiking and landscape photography.\nHe holds a PhD in Natural Language Processing", "public_name": "Konstantinos Perifanos", "guid": "f9a6bd92-eb74-5826-8325-6f4fcc881e30", "url": "https://pretalx.com/bbuzz22/speaker/Y9WUTK/"}, {"code": "YQQJ9A", "name": "Lily Davies", "avatar": "https://pretalx.com/media/avatars/YQQJ9A_SBiIpRj.webp", "biography": "Lily is a data scientist at Codec.ai, working with NLP, deep learning and IR to understand culture across different modalities.", "public_name": "Lily Davies", "guid": "7e0b5a83-6623-5509-b494-7d7c6e9b4bea", "url": "https://pretalx.com/bbuzz22/speaker/YQQJ9A/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/37DVBP/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/37DVBP/", "attachments": []}, {"guid": "39b8db50-f841-5db6-988f-708253b48488", "code": "YFLCEZ", "id": 16148, "logo": "https://pretalx.com/media/bbuzz22/submissions/YFLCEZ/52._Kan_Dmitry_-_Talman_Aarne_xl0j2fx.png", "date": "2022-06-14T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz22-16148-muves-multimodal-and-multilingual-vector-search-with-hardware-acceleration", "url": "https://pretalx.com/bbuzz22/talk/YFLCEZ/", "title": "Muves: Multimodal and multilingual vector search with Hardware Acceleration", "subtitle": "", "track": "Search", "type": "Talk", "language": "en", "abstract": "Bringing multimodal experience into search journey became of high interest lately: searching images with text, or looking inside an audio file, combining that with the rgb frames of a video stream. Today, vector search algorithms (like FAISS, HNSW, BuddyPQ) and databases (Vespa, Weaviate, Milvus and others) make these experiences a reality. But what if you as a user would like to stay with the familiar Elasticsearch / OpenSearch AND leverage the vector search at scale? In this talk we will take a hardware acceleration route to build a vector search experience over products and will show how you can blend the worlds of neural search with symbolic filters. \n\nWe will discuss use cases where adding multimodal and multilingual vector search will improve recall and compare results from Elasticsearch/OpenSearch with and without the vector search component using tools like Quepid. We will also investigate different fine-tuning approaches and compare their impact on different quality metrics.\n\nWe will demonstrate our findings using our end-to-end search solution Muves which combines traditional symbolic search with multimodal and multilingual vector search and includes an integrated fine-tuner for easy domain adaptation of pre-trained vector models.\n\nThe Search track is presented by OpenSource Connections", "description": null, "recording_license": "", "do_not_record": false, "persons": [{"code": "PUPNAC", "name": "Aarne Talman", "avatar": "https://pretalx.com/media/avatars/PUPNAC_j8OLMvC.webp", "biography": "Aarne has more than 16 years of experience in software development, consulting and academic research with specific focus on NLP and search engines.\n\nAarne is the CEO and co-founder of Basement AI, Lead AI Engineer at Silo AI and a PhD researcher in NLP at University of Helsinki.\n\nAarne is currently working on a new multilingual and multimodal search engine Muves.", "public_name": "Aarne Talman", "guid": "0f65fa2d-c6d4-530a-8eb3-bed6fc33b8be", "url": "https://pretalx.com/bbuzz22/speaker/PUPNAC/"}, {"code": "UBSNXT", "name": "Dmitry Kan", "avatar": "https://pretalx.com/media/avatars/UBSNXT_bAfzJep.webp", "biography": "Dmitry has been focusing on search engines since 2010 with Apache Lucene and Solr and since 2020 with Elasticsearch. He was responsible for building a search team and search technology powering AlphaSense product which today is used by thousands of reputed banks, hedge funds and companies in almost any industry vertical around the world. At Silo.AI Dmitry led a team of NLP researchers, search, frontend and QA engineers working on search at web scale, interacting with Product Management, Engineers and Data teams on a daily basis.\nDmitry has worked on open source projects Luke and Quepid and co-founded a few startups: in text analytics, edtech and team engagement space. He is the founder and host of the Vector Podcast (https://www.youtube.com/c/VectorPodcast). Having established himself as an independent researcher in vector search, Dmitry began working on Muves -- multilingual and multimodal search engine, together with his co-founders. In free time he enjoys reading, cycling and blogging about AI and Search. Dmitry holds a PhD in Applied Mathematics and a Master\u2019s in Computer Science.", "public_name": "Dmitry Kan", "guid": "ab2662a6-0951-5480-8996-828314221f94", "url": "https://pretalx.com/bbuzz22/speaker/UBSNXT/"}], "links": [], "feedback_url": "https://pretalx.com/bbuzz22/talk/YFLCEZ/feedback/", "origin_url": "https://pretalx.com/bbuzz22/talk/YFLCEZ/", "attachments": [{"title": "Slides", "url": "/media/bbuzz22/submissions/YFLCEZ/resources/Berlin_Buzzwords__Muves_-_Multilingual_and_Multimod_QYebUuT.pdf", "type": "related"}]}]}}]}}}