<?xml version="1.0" encoding="UTF-8"?><rss xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:atom="http://www.w3.org/2005/Atom" version="2.0" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:googleplay="http://www.google.com/schemas/play-podcasts/1.0"><channel><title><![CDATA[Data Science is not Rocket Science]]></title><description><![CDATA[Simple and effective solutions to real business problems]]></description><link>https://www.datascienceisnotrocketscience.com</link><image><url>https://substackcdn.com/image/fetch/$s_!SmAo!,w_256,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8549a974-57e4-4ee5-95d1-87cd9d20ae52_1024x1024.png</url><title>Data Science is not Rocket Science</title><link>https://www.datascienceisnotrocketscience.com</link></image><generator>Substack</generator><lastBuildDate>Thu, 16 Apr 2026 01:23:56 GMT</lastBuildDate><atom:link href="https://www.datascienceisnotrocketscience.com/feed" rel="self" type="application/rss+xml"/><copyright><![CDATA[Alan Krumholz Nudelstejer]]></copyright><language><![CDATA[en]]></language><webMaster><![CDATA[datascienceisnotrocketscience@substack.com]]></webMaster><itunes:owner><itunes:email><![CDATA[datascienceisnotrocketscience@substack.com]]></itunes:email><itunes:name><![CDATA[Alan Krumholz]]></itunes:name></itunes:owner><itunes:author><![CDATA[Alan Krumholz]]></itunes:author><googleplay:owner><![CDATA[datascienceisnotrocketscience@substack.com]]></googleplay:owner><googleplay:email><![CDATA[datascienceisnotrocketscience@substack.com]]></googleplay:email><googleplay:author><![CDATA[Alan Krumholz]]></googleplay:author><itunes:block><![CDATA[Yes]]></itunes:block><item><title><![CDATA[Importance of Model Calibration: Techniques & Benefits]]></title><description><![CDATA[Learn the importance of model calibration. 
Discover techniques like Platt scaling & Isotonic regression for aligning predicted scores with true probabilities.]]></description><link>https://www.datascienceisnotrocketscience.com/p/importance-of-model-calibration-techniques</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/importance-of-model-calibration-techniques</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Tue, 21 Mar 2023 23:41:42 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F502fc7a7-42d7-4b42-8d69-32ae6dd2d731_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!d-Z1!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F502fc7a7-42d7-4b42-8d69-32ae6dd2d731_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!d-Z1!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F502fc7a7-42d7-4b42-8d69-32ae6dd2d731_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!d-Z1!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F502fc7a7-42d7-4b42-8d69-32ae6dd2d731_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!d-Z1!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F502fc7a7-42d7-4b42-8d69-32ae6dd2d731_1024x1024.png 1272w, 
https://substackcdn.com/image/fetch/$s_!d-Z1!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F502fc7a7-42d7-4b42-8d69-32ae6dd2d731_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!d-Z1!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F502fc7a7-42d7-4b42-8d69-32ae6dd2d731_1024x1024.png" width="488" height="488" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/502fc7a7-42d7-4b42-8d69-32ae6dd2d731_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:488,&quot;bytes&quot;:1386783,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!d-Z1!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F502fc7a7-42d7-4b42-8d69-32ae6dd2d731_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!d-Z1!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F502fc7a7-42d7-4b42-8d69-32ae6dd2d731_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!d-Z1!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F502fc7a7-42d7-4b42-8d69-32ae6dd2d731_1024x1024.png 1272w, 
https://substackcdn.com/image/fetch/$s_!d-Z1!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F502fc7a7-42d7-4b42-8d69-32ae6dd2d731_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Model calibration is the process of ensuring that the predicted probabilities produced by a model align with the true underlying probabilities of the data. A well-calibrated model will output probabilities that accurately reflect the likelihood of a certain outcome. 
For example, if a model predicts a probability of 0.8 for a positive outcome, it should be the case that 80% of the time the outcome is actually positive.</p><p>A binary classification model can have <a href="https://en.wikipedia.org/wiki/Precision_and_recall">high precision and high recall</a>, but if the predicted probabilities produced by the model do not align with the true underlying probabilities of the data, it is not considered well calibrated. An example of this can be a model used for predicting credit default, which is utilized by financial institutions to determine the likelihood of a borrower defaulting on their loan. These models are usually trained on large datasets of historical credit data and various features such as income, credit history, employment status, and more.</p><p>However, if the model lacks proper calibration, it may not accurately predict the probability of default, despite being able to accurately separate positive from negative examples. For instance, the model may predict that a borrower has a 90% chance of defaulting, when in reality the probability is only 50%. This discrepancy can lead to poor decision-making, as the model's predictions will not be in line with the true probabilities, resulting in financial losses for the institution and adverse consequences for borrowers.</p><p>I recently <a href="https://www.datascienceisnotrocketscience.com/p/optimal-pricing">wrote about a price optimization model</a> that I had developed in the past. In that particular model, it was also critical to ensure we had calibrated scores, as any inaccuracies in the predicted probability of a sale would have led to flawed calculations of the expected revenue (sale probability * potential revenue = expected revenue), ultimately resulting in suboptimal pricing decisions.</p><p>To ensure the accuracy of the probabilities and sound decision making, it is often important to calibrate a model. 
Techniques such as <a href="https://en.wikipedia.org/wiki/Platt_scaling">Platt scaling</a> or <a href="https://en.wikipedia.org/wiki/Isotonic_regression">Isotonic regression</a> can be used to adjust the model's predictions to align with the true probabilities. Platt scaling is a technique that involves fitting a logistic regression model to the predicted scores of a binary classification model, while Isotonic Regression involves fitting a free-form line to the predicted scores. Both techniques aim to discover a transformation of the predicted scores that aligns them more closely with the true probabilities.</p><p>In conclusion, model score calibration is a crucial step in ensuring that the predictions made by a model align with the true underlying probabilities of the data. A well-calibrated model will generate probabilities that accurately depict the likelihood of a specific outcome. It is important to consider the usage of the model scores and implement calibration during the development and deployment stages of machine learning models when necessary.</p>]]></content:encoded></item><item><title><![CDATA[Accurately Forecasting Sales Months in Advance with ML]]></title><description><![CDATA[Learn how we utilized AutoML to accurately forecast clothing sales months in advance, resulting in millions of dollars in revenue and fewer leftovers.]]></description><link>https://www.datascienceisnotrocketscience.com/p/accurately-forecasting-sales-months</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/accurately-forecasting-sales-months</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Wed, 15 Mar 2023 13:41:21 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fbf19b0b7-4b85-428f-a554-824f4a0021f8_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div 
class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!LQ3-!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fbf19b0b7-4b85-428f-a554-824f4a0021f8_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!LQ3-!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fbf19b0b7-4b85-428f-a554-824f4a0021f8_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!LQ3-!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fbf19b0b7-4b85-428f-a554-824f4a0021f8_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!LQ3-!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fbf19b0b7-4b85-428f-a554-824f4a0021f8_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!LQ3-!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fbf19b0b7-4b85-428f-a554-824f4a0021f8_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!LQ3-!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fbf19b0b7-4b85-428f-a554-824f4a0021f8_1024x1024.png" width="486" height="486" 
data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/bf19b0b7-4b85-428f-a554-824f4a0021f8_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:486,&quot;bytes&quot;:1795468,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!LQ3-!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fbf19b0b7-4b85-428f-a554-824f4a0021f8_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!LQ3-!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fbf19b0b7-4b85-428f-a554-824f4a0021f8_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!LQ3-!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fbf19b0b7-4b85-428f-a554-824f4a0021f8_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!LQ3-!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fbf19b0b7-4b85-428f-a554-824f4a0021f8_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" 
fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>The Business Solutions Series is a compilation of solutions to various business challenges that I have encountered throughout my professional journey.</p><h4>Context</h4><p>We were engaged by a leading clothing company that operates hundreds of stores worldwide and is renowned for its affordable custom clothing designs. The company sought to accurately forecast the number of clothing items, in various colors, they would sell in advance of each season.</p><h4>Problem</h4><p>The company updates its clothing lines and colors every three months, making it crucial for them to accurately forecast sales in order to optimize revenue and avoid excess inventory. 
The company needed to forecast sales several months in advance to order the necessary fabric and produce clothing with their manufacturing partners before each season began.</p><h4>Objective</h4><p>To provide the company's planners with highly accurate quarterly sales forecasts for all items, in all colors, to enable optimal production decisions.</p><h4>Solution</h4><p>To address the challenge of accurately forecasting sales for the clothing company, we utilized an <a href="https://www.datascienceisnotrocketscience.com/p/making-the-case-for-cloud-managed">AutoML managed service</a> to develop a complex regression model. This model outperformed the more traditional forecasting methods that the company had previously employed. While the company only required quarterly forecasts, we found that training a more granular model and aggregating simulated numbers for the quarter resulted in more accurate predictions.</p><p>We utilized the company's historical sales data to train the model to make predictions for each day, store, item, and color. Our model included three types of features: date features (day of the week, day of the month, month of the year, holiday, etc.), store features (country, city, store size, etc.), and item features (type, subtype, color, standardized description, etc.). Using the new items in new colors, we then used the model to simulate sales on all stores for each day in a future season and aggregated the data to the appropriate granularity for the planners.</p><p>To ensure the model's real-life accuracy given the granularity of our training data, we were careful to prevent data leakage between the train, validation, and test datasets by splitting them by quarter. 
This ensured that the model was validated and tested on full quarters with new products/colors that it had never seen during training.</p><p>One of the biggest challenges was convincing the planners to trust the model's predictions, even when we had data to prove its accuracy and superiority. To address this, we implemented what-if simulations, using items from previous seasons to forecast sales for future seasons on these past items. This helped the planners identify which new items the model predicted would sell similarly to past items/colors, increasing their confidence in the model's predictions. Additionally, the model allows the planners to run simulations of new items on past seasons, providing further insights on how they would have performed.</p><h4>Impact</h4><p>The implementation of these new models had a significant impact on the company's operations. The planners began utilizing these models for their decision-making, resulting in millions of dollars in revenue and significantly fewer leftovers.</p>]]></content:encoded></item><item><title><![CDATA[The Next Frontier for AutoML: Learning Custom Embeddings]]></title><description><![CDATA[Learn how custom embeddings can transform machine learning for businesses. 
Explore the Two Tower model and Graph Neural Networks for creating potent embeddings.]]></description><link>https://www.datascienceisnotrocketscience.com/p/the-next-frontier-for-automl-learning</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/the-next-frontier-for-automl-learning</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Wed, 01 Mar 2023 11:09:00 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6cd059d3-7b37-4f04-ad48-4d35f3f6b48f_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!2i15!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baebe9e-09f8-470b-afb2-c9058926db2a_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!2i15!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baebe9e-09f8-470b-afb2-c9058926db2a_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!2i15!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baebe9e-09f8-470b-afb2-c9058926db2a_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!2i15!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baebe9e-09f8-470b-afb2-c9058926db2a_1024x1024.png 1272w, 
https://substackcdn.com/image/fetch/$s_!2i15!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baebe9e-09f8-470b-afb2-c9058926db2a_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!2i15!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baebe9e-09f8-470b-afb2-c9058926db2a_1024x1024.png" width="444" height="444" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/2baebe9e-09f8-470b-afb2-c9058926db2a_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:444,&quot;bytes&quot;:1356114,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!2i15!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baebe9e-09f8-470b-afb2-c9058926db2a_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!2i15!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baebe9e-09f8-470b-afb2-c9058926db2a_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!2i15!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baebe9e-09f8-470b-afb2-c9058926db2a_1024x1024.png 1272w, 
https://substackcdn.com/image/fetch/$s_!2i15!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2baebe9e-09f8-470b-afb2-c9058926db2a_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>As an AI enthusiast, I'm always excited to explore the latest developments in machine learning. 
Today, I want to share my thoughts on what I believe is the next frontier for AutoML - custom embeddings.</p><p>In a <a href="https://www.datascienceisnotrocketscience.com/p/making-the-case-for-cloud-managed">previous blog post</a>, I talked about my preferred AutoML tool - Vertex AI AutoML. While it's a great tool to build ML models, it won't let you learn embeddings from your data. I believe that the future of machine learning lies in the development of tools that allow users to learn custom <a href="https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture">embeddings</a> from their own data. These embeddings can represent different business entities and have the potential to transform the way we solve business problems.</p><p>So, what exactly are embeddings? In a nutshell, they create a mathematical representation of different business entities such as users, products, and stores, allowing computers to understand information about these entities and how they are related to each other. This is incredibly useful for businesses, as embeddings can be used to build ML models with very sparse training data and also improve the speed of model inference in high-volume real-time systems with very strict latency requirements.</p><p>In my opinion, the Two Tower model is a viable solution for addressing this issue. It is a powerful neural network architecture that can create high-quality embeddings of various business entities. The model embeds pairs of diverse entities, such as users and products, which are each represented by structured and unstructured data, into the same embedding space. This allows the embeddings to encode how different entities relate to each other. 
However, there is currently no robust AutoML-managed tool on the market to train a Two Tower Model, making it difficult for businesses to adopt this technique without substantial expertise and manual effort.</p><p>Fortunately, the field of AutoML is constantly evolving, and promising developments like Graph Neural Networks (GNNs) are also emerging. <a href="https://kumo.ai/">Kumo</a>, for example, is developing tools that model existing structured business data as graphs and create powerful ML models based on those graphs. Graph networks are ideal for learning potent embeddings for business entities in the data because graphs can easily encode complex relationships and dependencies between these entities. I am eager to see what the team comes up with in the form of new products.</p><p>In conclusion, the potential of learning custom embeddings using AutoML is vast and has the ability to transform the way businesses solve problems and innovate. Although there are currently no robust AutoML-managed tools available for creating custom embeddings, promising developments like the Two Tower model and Graph Neural Networks are emerging. As AutoML continues to evolve, we can expect to see new and more effective tools that will enable businesses to easily learn and utilize custom embeddings to their fullest potential.<br><br></p>]]></content:encoded></item><item><title><![CDATA[Computer Vision: Frictionless Mobile Document Digitization]]></title><description><![CDATA[Discover our solution to streamline the document digitization process. 
Learn how we utilized GCP AutoML vision and NLP to develop a scalable API for mobile app.]]></description><link>https://www.datascienceisnotrocketscience.com/p/frictionless-document-digitization</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/frictionless-document-digitization</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Wed, 15 Feb 2023 11:47:12 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fb59eda2b-6191-4b9e-9146-925b461f5803_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!y_CU!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fb59eda2b-6191-4b9e-9146-925b461f5803_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!y_CU!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fb59eda2b-6191-4b9e-9146-925b461f5803_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!y_CU!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fb59eda2b-6191-4b9e-9146-925b461f5803_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!y_CU!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fb59eda2b-6191-4b9e-9146-925b461f5803_1024x1024.png 1272w, 
https://substackcdn.com/image/fetch/$s_!y_CU!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fb59eda2b-6191-4b9e-9146-925b461f5803_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!y_CU!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fb59eda2b-6191-4b9e-9146-925b461f5803_1024x1024.png" width="484" height="484" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/b59eda2b-6191-4b9e-9146-925b461f5803_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:484,&quot;bytes&quot;:1492186,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!y_CU!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fb59eda2b-6191-4b9e-9146-925b461f5803_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!y_CU!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fb59eda2b-6191-4b9e-9146-925b461f5803_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!y_CU!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fb59eda2b-6191-4b9e-9146-925b461f5803_1024x1024.png 
1272w, https://substackcdn.com/image/fetch/$s_!y_CU!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fb59eda2b-6191-4b9e-9146-925b461f5803_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>The Business Solutions Series is a compilation of solutions to various business challenges that I have encountered throughout my professional journey.</p><h4>Context&nbsp;</h4><p>We were engaged by a client that specializes in digitizing documents. 
The client sought a solution to enable their mobile users to capture physical documents using their phone cameras and fill the documents electronically through their mobile application.</p><h4>Objective</h4><p>The objective was to develop a model that could accurately detect all inputs on a document, including the precise location, size, and type of the inputs (e.g. name boxes, date boxes, signature boxes).</p><h4>Solution</h4><p>To achieve this objective, we utilized several of Google Cloud Platform's <a href="https://www.datascienceisnotrocketscience.com/p/making-the-case-for-cloud-managed">AutoML managed services</a> and created a scalable Python API (also utilizing a GCP managed service) to coordinate the calls to all of the managed services. The mobile application was able to call this custom API every time a user captured a document using their phone camera.</p><p>The custom Python API received the document image as input and:</p><ul><li><p>Utilized the <a href="https://cloud.google.com/vision/docs/ocr">GCP Vision API</a>, which is trained by Google, to extract all text and text metadata from the document image (text metadata includes the precise coordinates of each word in the document image).</p></li><li><p>In parallel, utilized the GCP AutoML Image managed service where we had trained a custom <a href="https://cloud.google.com/vertex-ai/docs/training-overview#object_detection_for_images">object detection model</a> and deployed it as a service. 
This model detected all input boxes on the document image and returned their bounding boxes (coordinates).</p></li><li><p>Inserted a custom keyword for each detected input box into the document text (using the coordinates from both the Vision API and custom object detection model output to insert the keyword in the correct location).</p></li><li><p>Utilized a final custom AutoML Text model, trained to <a href="https://cloud.google.com/vertex-ai/docs/training-overview#entity_extraction_for_text">perform entity extraction</a>, to classify the input boxes into their respective types based on the surrounding text.</p></li></ul><p>For example:</p><ul><li><p>Input document image:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!hIcp!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F548247d5-94b2-42fd-ad83-ad02ded75a36_436x348.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!hIcp!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F548247d5-94b2-42fd-ad83-ad02ded75a36_436x348.png 424w, https://substackcdn.com/image/fetch/$s_!hIcp!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F548247d5-94b2-42fd-ad83-ad02ded75a36_436x348.png 848w, https://substackcdn.com/image/fetch/$s_!hIcp!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F548247d5-94b2-42fd-ad83-ad02ded75a36_436x348.png 1272w, 
https://substackcdn.com/image/fetch/$s_!hIcp!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F548247d5-94b2-42fd-ad83-ad02ded75a36_436x348.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!hIcp!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F548247d5-94b2-42fd-ad83-ad02ded75a36_436x348.png" width="376" height="300.11009174311926" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/548247d5-94b2-42fd-ad83-ad02ded75a36_436x348.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:348,&quot;width&quot;:436,&quot;resizeWidth&quot;:376,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!hIcp!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F548247d5-94b2-42fd-ad83-ad02ded75a36_436x348.png 424w, https://substackcdn.com/image/fetch/$s_!hIcp!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F548247d5-94b2-42fd-ad83-ad02ded75a36_436x348.png 848w, https://substackcdn.com/image/fetch/$s_!hIcp!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F548247d5-94b2-42fd-ad83-ad02ded75a36_436x348.png 1272w, 
https://substackcdn.com/image/fetch/$s_!hIcp!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F548247d5-94b2-42fd-ad83-ad02ded75a36_436x348.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div></li></ul><ul><li><p>Vision API output:</p><ul><li><p>Text: &#8220;Sample Document Name Date Signature End Of Sample Document&#8221;</p></li><li><p>Metadata word coordinates: Sample &lt;0, 0&gt;, Document &lt;9, 0&gt; , Name &lt;0, 4&gt; &#8230;</p></li></ul></li><li><p>Object detection model output:</p><ul><li><p>Coordinates: BOX_1 &lt;6, 4&gt;, BOX_2 &lt;21 
,4&gt;, BOX_3 &lt;0 ,7&gt;</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!IsZg!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17e27394-59af-4d6a-8de9-d0fb2d8125ee_426x342.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!IsZg!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17e27394-59af-4d6a-8de9-d0fb2d8125ee_426x342.png 424w, https://substackcdn.com/image/fetch/$s_!IsZg!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17e27394-59af-4d6a-8de9-d0fb2d8125ee_426x342.png 848w, https://substackcdn.com/image/fetch/$s_!IsZg!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17e27394-59af-4d6a-8de9-d0fb2d8125ee_426x342.png 1272w, https://substackcdn.com/image/fetch/$s_!IsZg!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17e27394-59af-4d6a-8de9-d0fb2d8125ee_426x342.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!IsZg!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17e27394-59af-4d6a-8de9-d0fb2d8125ee_426x342.png" width="390" height="313.09859154929575" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/17e27394-59af-4d6a-8de9-d0fb2d8125ee_426x342.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:342,&quot;width&quot;:426,&quot;resizeWidth&quot;:390,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!IsZg!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17e27394-59af-4d6a-8de9-d0fb2d8125ee_426x342.png 424w, https://substackcdn.com/image/fetch/$s_!IsZg!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17e27394-59af-4d6a-8de9-d0fb2d8125ee_426x342.png 848w, https://substackcdn.com/image/fetch/$s_!IsZg!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17e27394-59af-4d6a-8de9-d0fb2d8125ee_426x342.png 1272w, https://substackcdn.com/image/fetch/$s_!IsZg!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17e27394-59af-4d6a-8de9-d0fb2d8125ee_426x342.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div></li></ul></li></ul><ul><li><p>Modified text using coordinates from both outputs:&nbsp;</p><ul><li><p>&#8220;Sample Document Name INPUT_BOX Date INPUT_BOX Signature INPUT_BOX End Of Sample Document&#8221;</p></li></ul></li><li><p>Entity Extraction model output when providing the modified text:</p><ul><li><p>INPUT_BOX [0]=name,&nbsp; INPUT_BOX [1]=date,&nbsp; INPUT_BOX [2]=signature&nbsp;&nbsp;</p></li></ul></li></ul><p>In order to train the object detection and entity extraction models, we had to first obtain high-quality labeled data. To do this, we collaborated with a data annotation company that utilized human annotators to label thousands of document images. The annotators drew bounding boxes around the input boxes and selected input types for these, providing the training data our models needed to learn. Once the labeled data was obtained, it could be loaded into the managed AutoML Image service with ease, allowing for the training of an object detection model. 
The default settings and robust capabilities of the AutoML managed service provided us with an effective model for this task. Initially, before we used professional help, we encountered some issues with the model, but upon realizing that the issue was with the training data, which had some inconsistent annotations, we enlisted the services of a professional data annotation vendor. This led to an improvement in the quality of the training data and, subsequently, an improvement in the performance of the model from the AutoML service. </p><p>Similarly, we utilized the input type annotations to train the entity extraction model using the AutoML Text service, which also resulted in the managed service automatically learning a model that effectively addressed this task. </p><p>While there are multiple companies that offer labeling services for machine learning projects, I highly recommend <a href="https://www.sama.com/">Sama</a> due to their commitment to hiring individuals living below the poverty line in developing countries, primarily in Africa, and helping them improve their livelihoods. I have first-hand experience with Sama as my spouse worked there for several years. I had the opportunity to visit their centers, meet their agents, and read the <a href="https://www.sama.com/blog/rct-results-mit/">academic study</a> about their impact.</p><h4>Impact</h4><p>As a result of this solution, the mobile application was successfully deployed and is now being utilized by hundreds of thousands of users globally to efficiently digitize and fill documents. 
The API has proven to be a valuable tool in streamlining the document digitization process.</p>]]></content:encoded></item><item><title><![CDATA[Navigate A/B Testing Pitfalls: Avoid False Positives]]></title><description><![CDATA[Learn how to avoid false positives in A/B testing by carefully selecting key variables and utilizing the appropriate p-value threshold.]]></description><link>https://www.datascienceisnotrocketscience.com/p/avoiding-false-positives-in-ab-testing</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/avoiding-false-positives-in-ab-testing</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Wed, 01 Feb 2023 12:49:02 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff6063ad2-d3ae-4ba4-b160-7e583b7f2787_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!Ipc1!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9102d2e7-a91b-4280-abe0-473cb1b12166_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!Ipc1!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9102d2e7-a91b-4280-abe0-473cb1b12166_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!Ipc1!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9102d2e7-a91b-4280-abe0-473cb1b12166_1024x1024.png 848w, 
https://substackcdn.com/image/fetch/$s_!Ipc1!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9102d2e7-a91b-4280-abe0-473cb1b12166_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!Ipc1!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9102d2e7-a91b-4280-abe0-473cb1b12166_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!Ipc1!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9102d2e7-a91b-4280-abe0-473cb1b12166_1024x1024.png" width="518" height="518" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/9102d2e7-a91b-4280-abe0-473cb1b12166_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:518,&quot;bytes&quot;:1541635,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!Ipc1!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9102d2e7-a91b-4280-abe0-473cb1b12166_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!Ipc1!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9102d2e7-a91b-4280-abe0-473cb1b12166_1024x1024.png 848w, 
https://substackcdn.com/image/fetch/$s_!Ipc1!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9102d2e7-a91b-4280-abe0-473cb1b12166_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!Ipc1!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F9102d2e7-a91b-4280-abe0-473cb1b12166_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>A/B testing is a widely employed statistical methodology used to compare the performance of two variations of a product or 
system, designated as version A and version B. This technique is used to determine which version performs better by showing one version to a subset of users while exposing the other version to a different subset of users. For example, an organization may wish to evaluate whether a redesigned website leads to an increase in sales compared to the previous design and conduct an A/B test to measure the conversion rate of each group.</p><p>One important statistical measure utilized in A/B testing is the "p-value". The p-value is the probability of observing a difference at least as large as the one measured if there were, in fact, no real difference in performance between the two groups being compared. A low p-value (generally below 0.05) implies that the observed difference is statistically significant and is unlikely to be the result of chance. Conversely, a high p-value (above 0.05) suggests that the difference may not be statistically significant and could have occurred by chance. It's worth noting that the p-value threshold of 0.05 is widely used as a convention in many fields, but it's important to keep in mind that other p-value thresholds such as 0.01 or 0.1 may be utilized depending on the context and desired level of confidence.</p><p>For instance, let's consider a scenario where an organization wants to evaluate whether a new marketing campaign will be more effective in increasing sales compared to the current campaign. The organization decides to conduct an A/B test by exposing half of the customers to the new campaign while the other half is exposed to the current campaign. The organization measures six different effects: the number of clicks on the campaign's landing page, the number of purchases made, the average order value, customer satisfaction score, repeat purchase rate, and overall revenue. 
Utilizing a p-value threshold of 0.05 to determine statistical significance, the organization finds that the new campaign performed significantly better than the current campaign in terms of repeat purchase rate, but there was no significant difference in the other five effects.</p><p>It's crucial to recognize that when conducting A/B tests and measuring multiple variables, the likelihood of observing at least one statistically significant difference due to random noise instead of a real difference increases. If we take the example above with a threshold of 0.05 for the p-value and the measurement of six variables, and assume the six measurements are independent, there is a 26.5% (1 - 0.95^6) chance of measuring at least one "significant" difference due to random noise in any experiment. Given that roughly one out of every four experiments can yield "significant" differences due to random noise alone in this example, can we truly be certain that the new campaign is superior? I am not so sure. This probability increases to 51.2% when measuring 14 variables. To address this pitfall, it is essential to carefully choose and measure just a limited number of key variables that are directly related to the hypothesis of the experiment, as well as to consider using a different p-value threshold depending on the context and desired level of confidence.</p><p>In conclusion, A/B testing is a powerful tool for evaluating the performance of two variations of a product or system. However, it is essential to understand the limitations of the p-value, the increasing probability of observing false positive results when measuring multiple variables, and the importance of selecting only relevant variables and choosing the appropriate p-value threshold. 
By doing so, organizations can ensure that their experiment results are accurate and their actions based on these are sound.</p>]]></content:encoded></item><item><title><![CDATA[Optimal Pricing for Revenue Maximization]]></title><description><![CDATA[Learn how our optimal pricing strategy, using a binary classification model and neural network ensemble, helped increase company revenue by millions.]]></description><link>https://www.datascienceisnotrocketscience.com/p/optimal-pricing</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/optimal-pricing</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Tue, 17 Jan 2023 19:08:41 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/h_600,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F79bd40c3-0c89-4792-a209-57ff9c1f993a_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!2a0u!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F79bd40c3-0c89-4792-a209-57ff9c1f993a_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!2a0u!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F79bd40c3-0c89-4792-a209-57ff9c1f993a_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!2a0u!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F79bd40c3-0c89-4792-a209-57ff9c1f993a_1024x1024.png 848w, 
https://substackcdn.com/image/fetch/$s_!2a0u!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F79bd40c3-0c89-4792-a209-57ff9c1f993a_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!2a0u!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F79bd40c3-0c89-4792-a209-57ff9c1f993a_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!2a0u!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F79bd40c3-0c89-4792-a209-57ff9c1f993a_1024x1024.png" width="530" height="530" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/79bd40c3-0c89-4792-a209-57ff9c1f993a_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:530,&quot;bytes&quot;:1589867,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!2a0u!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F79bd40c3-0c89-4792-a209-57ff9c1f993a_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!2a0u!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F79bd40c3-0c89-4792-a209-57ff9c1f993a_1024x1024.png 
848w, https://substackcdn.com/image/fetch/$s_!2a0u!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F79bd40c3-0c89-4792-a209-57ff9c1f993a_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!2a0u!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F79bd40c3-0c89-4792-a209-57ff9c1f993a_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>The Business Solutions Series is a compilation of solutions to 
various business challenges that I have encountered throughout my professional journey.</p><h4>Context&nbsp;</h4><p>We were working on a system that processes millions and millions of sales opportunities daily and needs to make optimal pricing decisions for each one. For each opportunity, the system needs to choose a price that will maximize revenue while considering the probability of the sale being completed at that price. As the price increases, the potential revenue from the transaction increases, but the likelihood of making the sale decreases.</p><h4>Objective</h4><p>The objective was to maximize revenue through smart pricing.&nbsp;</p><h4>Solution</h4><p>To achieve this, we first discretized the range of product prices by dividing them into X equally spaced price options. We then used various transaction attributes (such as details about the product, the buyer, and the market conditions) to build a binary classification model that would predict the likelihood of a sale being completed at a given price. The price, along with the transaction attributes, was used as a feature in the model.<br><br>One of the crucial elements of this project was obtaining reliable and unbiased data to accurately learn the true probabilities. Given that the system handles a vast number of transactions daily, we utilized a small random subset of these transactions, and we randomly selected one of the X different prices for each transaction. We then used this set of randomly chosen transactions and prices as the training data for our model.</p><p>After obtaining a clean dataset for training the model, we leveraged an <a href="https://www.datascienceisnotrocketscience.com/p/making-the-case-for-cloud-managed">AutoML managed service</a> to effortlessly learn a sophisticated yet precise neural network ensemble. This model generated accurate probabilities of closing a sale for any of the X prices in combination with the transaction attributes. 
For every new transaction, we generated X different sale probabilities, and then computed the expected revenue and selected the price that would maximize it (i.e. sale probability * potential revenue = expected revenue).</p><h4>Impact</h4><p>This straightforward solution allowed us to quickly implement the price optimizer in production and rapidly increase company revenue by millions of dollars.</p>]]></content:encoded></item><item><title><![CDATA[Online Ad Experiment: Comparing Ad Channels]]></title><description><![CDATA[Learn the results of a recent online ad experiment, comparing different ad channels for increasing subscribers to my publication.]]></description><link>https://www.datascienceisnotrocketscience.com/p/online-advertisement-experiment</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/online-advertisement-experiment</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Wed, 04 Jan 2023 01:27:35 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/h_600,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4cfefbe4-9c51-4af2-90e4-7c5153d29211_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!wDxV!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd78bf96a-9076-475f-b294-c3bc4d385e50_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!wDxV!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd78bf96a-9076-475f-b294-c3bc4d385e50_1024x1024.png 424w, 
https://substackcdn.com/image/fetch/$s_!wDxV!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd78bf96a-9076-475f-b294-c3bc4d385e50_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!wDxV!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd78bf96a-9076-475f-b294-c3bc4d385e50_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!wDxV!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd78bf96a-9076-475f-b294-c3bc4d385e50_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!wDxV!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd78bf96a-9076-475f-b294-c3bc4d385e50_1024x1024.png" width="518" height="518" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/d78bf96a-9076-475f-b294-c3bc4d385e50_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:518,&quot;bytes&quot;:690112,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!wDxV!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd78bf96a-9076-475f-b294-c3bc4d385e50_1024x1024.png 424w, 
https://substackcdn.com/image/fetch/$s_!wDxV!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd78bf96a-9076-475f-b294-c3bc4d385e50_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!wDxV!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd78bf96a-9076-475f-b294-c3bc4d385e50_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!wDxV!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fd78bf96a-9076-475f-b294-c3bc4d385e50_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" 
y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>I recently ran a small online advertisement experiment with the goal of increasing the number of subscribers to this Substack publication, and also of writing this post about the experience. I focused my experiments on two advertisement channels: LinkedIn and Google search.</p><p>LinkedIn was an obvious choice as this is the only social network where I (and also this publication) have a presence and followers. It is also a social network focused on professional networking and professional topics which fully aligns with the essence of this publication.</p><p>I picked Google as well because it&#8217;s not targeting specific people based on their profile details and historical usage of a social network, but rather targeting anyone using a search engine who is trying to find very specific information on the Internet at that specific point in time. Google&#8217;s strategy is very different to targeting specific people in social media so I thought it would be interesting to try both channels and compare results.</p><p>Both of these advertisement channels have some key differences and similarities which I will try to explain below:</p><p><strong>Billing</strong></p><p>LinkedIn charges you for the total number of ad impressions served. </p><p>Google will charge you only for the clicks made on your ads regardless of how many impressions they end up serving.</p><p><strong>Campaign Optimization Goal</strong></p><p>LinkedIn will charge more per impression depending on your campaign goal (e.g. brand awareness, website traffic, etc.). They add different features to the ad depending on your goal (for example some campaign goals will add a &#8220;follow&#8221; button to the ad and others will not). 
However, in my experiments, picking any goal other than the most basic one had no impact on clicks or conversions but made the cost per impression (and the campaign in general) substantially more expensive without any benefits.</p><p>Google lets you optimize your campaign on clicks or conversions without any extra cost. To make my campaign optimize for conversions (my real advertisement goal) I needed to set up a Google Analytics account in Substack, link my Google Analytics account to my Google Ads account, and create a &#8220;goal&#8221; in Google Analytics to track every time it detected a &#8220;Signup&#8221; event in the website.&nbsp;</p><p><strong>Audience Targeting</strong></p><p>LinkedIn provides an automatic way to generate &#8220;Lookalike&#8221; audiences on their network using their proprietary data and models. I leveraged this feature using all the users who have interacted with <a href="https://www.linkedin.com/company/data-science-is-not-rocket-science">my publication&#8217;s LinkedIn page</a> as an input. Their service then created a big audience of users in their network who are &#8220;similar&#8221; to my input users. They don&#8217;t disclose much about how these models work but I&#8217;m sure they use some sort of user similarity scores based on their profile information and their behavior on the platform.</p><p>LinkedIn also has an &#8220;Insights Tag&#8221; that you can add to your website/publication directly and build the &#8220;lookalike&#8221; audiences based on the visitors to your website. 
However, I wasn&#8217;t able to try this feature as the only way to set this up in Substack is with Google Tag Manager and, unfortunately, Substack&#8217;s implementation of Google Tag Manager code is broken (I have reported this to Substack but they said that this is currently not a priority.)</p><p>To set the targeting in Google I provided some relevant search keywords related to my ad (I started with &#8220;learn data science&#8221;) but it also has an automatic feature that lets Google experiment and suggest other keywords that will work well with your ad. It does this as it&#8217;s running the campaign so I let it run for a couple of days to gather all their keyword recommendations and use those too for my campaign. They ended up recommending more than 300 additional keywords that I added to the campaign (e.g. &#8220;ml and ai&#8221;, &#8220;be a data scientist&#8221;, &#8220;path to become data scientist&#8221;, etc.)</p><p><strong>Partner Ad Networks</strong></p><p>LinkedIn lets you expand the reach of your campaigns by targeting specific LinkedIn users outside of LinkedIn (by using display ads on other websites). They achieve this by partnering with ad networks and publishers (other websites) in the industry. The cost per impression is reduced substantially when your ads run there so I also experimented with this feature.</p><p>Google also lets you expand your campaigns with two types of partners: search partners, who also operate websites that are search based (users inputting a query to find information), and ad network partners, which let Google run your (display) ads on random websites around the world. The cost per click also decreases when your ads run with partners so I also experimented with these. The search partners made a lot of sense to me as the targeting mechanism is similar to Google search, but the ad network partners didn&#8217;t as we aren&#8217;t really targeting a specific user on those random websites like we are with LinkedIn. 
That said, Google still charges per click and not per impression in those random websites so I was still interested in trying it out.</p><p><strong>Ads</strong></p><p>In LinkedIn I set the campaign to use all the postings I&#8217;ve made on my publication's LinkedIn page as ads.</p><p>In Google I gave them a few headers and descriptions about my websites and they ran experiments to find the best combinations of these for different queries.</p><h2>Experiment results</h2><p><strong>Experiments</strong></p><ul><li><p><strong>LA</strong>: LinkedIn campaign leveraging ad network partners.</p></li><li><p><strong>L</strong>: LinkedIn campaign only (no partners)</p></li><li><p><strong>GSA</strong>: Google campaign leveraging search and ad network partners.</p></li><li><p><strong>GS</strong>: Google Campaign leveraging search partners.</p></li><li><p><strong>G</strong>: Google campaign only (no partners)</p></li></ul><p><strong>Metrics</strong></p><ul><li><p><strong>CPM</strong>: Cost per mille in dollars (cost per thousand impressions - industry standard)</p></li><li><p><strong>CTR</strong>: Click through rate or percentage of impressions that lead to a click.</p></li><li><p><strong>CPC</strong>: Cost per click in dollars</p></li><li><p><strong>SUR</strong>: Signup rate or percentage of users that sign up to the publication after clicking on the ad.</p></li><li><p><strong>CPS</strong>: Cost per signup in dollars.</p></li></ul><p>Except for Google search, it is common for all publishers (websites), social networks, etc, to charge based on impressions. As a data scientist with experience in ad tech I&#8217;m not thrilled about having to pay for impressions. 
Impressions will rarely be correlated with my campaign goals (publication signups in this case) and there is no guarantee that all these impressions are really being optimized for my benefit or, even worse, that nefarious entities or users in the complex advertisement network are not committing <a href="https://www.datascienceisnotrocketscience.com/p/identifying-online-bots">ad fraud</a>.</p><p>Let&#8217;s compare the experiment results using the CPM lens:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!Y5Bh!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F947f7a1c-8b47-42b1-9d41-df0f2a9e5a81_318x254.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!Y5Bh!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F947f7a1c-8b47-42b1-9d41-df0f2a9e5a81_318x254.png 424w, https://substackcdn.com/image/fetch/$s_!Y5Bh!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F947f7a1c-8b47-42b1-9d41-df0f2a9e5a81_318x254.png 848w, https://substackcdn.com/image/fetch/$s_!Y5Bh!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F947f7a1c-8b47-42b1-9d41-df0f2a9e5a81_318x254.png 1272w, https://substackcdn.com/image/fetch/$s_!Y5Bh!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F947f7a1c-8b47-42b1-9d41-df0f2a9e5a81_318x254.png 1456w" sizes="100vw"><img 
src="https://substackcdn.com/image/fetch/$s_!Y5Bh!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F947f7a1c-8b47-42b1-9d41-df0f2a9e5a81_318x254.png" width="318" height="254" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/947f7a1c-8b47-42b1-9d41-df0f2a9e5a81_318x254.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:254,&quot;width&quot;:318,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:64188,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!Y5Bh!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F947f7a1c-8b47-42b1-9d41-df0f2a9e5a81_318x254.png 424w, https://substackcdn.com/image/fetch/$s_!Y5Bh!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F947f7a1c-8b47-42b1-9d41-df0f2a9e5a81_318x254.png 848w, https://substackcdn.com/image/fetch/$s_!Y5Bh!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F947f7a1c-8b47-42b1-9d41-df0f2a9e5a81_318x254.png 1272w, https://substackcdn.com/image/fetch/$s_!Y5Bh!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F947f7a1c-8b47-42b1-9d41-df0f2a9e5a81_318x254.png 1456w" sizes="100vw" 
loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>GSA experiment gave us the lowest CPM compared to all others. It is also interesting to see that although the Google experiments seem to have lower CPM in general, there is not a clear winner platform-wise as LA has a lower CPM than G.</p><p>As discussed earlier, Google charges based on clicks which in my opinion is a more valuable metric for advertisers than impressions. Clicks can sometimes be somewhat correlated to the campaign goals but unfortunately clicks are still far from perfect. 
Clicks can still be entirely uncorrelated with the goal in some campaigns, and fraudsters can also abuse them.</p><p>Let&#8217;s compare the experiment results using the CPC lens:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!kBu2!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F2dca5dea-08da-4cbe-82fe-5cfb67eb0681_538x258.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!kBu2!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F2dca5dea-08da-4cbe-82fe-5cfb67eb0681_538x258.png 424w, https://substackcdn.com/image/fetch/$s_!kBu2!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F2dca5dea-08da-4cbe-82fe-5cfb67eb0681_538x258.png 848w, https://substackcdn.com/image/fetch/$s_!kBu2!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F2dca5dea-08da-4cbe-82fe-5cfb67eb0681_538x258.png 1272w, https://substackcdn.com/image/fetch/$s_!kBu2!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F2dca5dea-08da-4cbe-82fe-5cfb67eb0681_538x258.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!kBu2!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F2dca5dea-08da-4cbe-82fe-5cfb67eb0681_538x258.png" width="538" height="258" 
data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/2dca5dea-08da-4cbe-82fe-5cfb67eb0681_538x258.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:258,&quot;width&quot;:538,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:118633,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!kBu2!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F2dca5dea-08da-4cbe-82fe-5cfb67eb0681_538x258.png 424w, https://substackcdn.com/image/fetch/$s_!kBu2!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F2dca5dea-08da-4cbe-82fe-5cfb67eb0681_538x258.png 848w, https://substackcdn.com/image/fetch/$s_!kBu2!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F2dca5dea-08da-4cbe-82fe-5cfb67eb0681_538x258.png 1272w, https://substackcdn.com/image/fetch/$s_!kBu2!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F2dca5dea-08da-4cbe-82fe-5cfb67eb0681_538x258.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" 
stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>GSA is still the clear winner when using this metric. Note that based on cost per click the Google platform is also a clear winner over the LinkedIn platform, not only in terms of cost (which is not surprising to me given LinkedIn charges per impression and Google per click) but also in the percentage of users clicking on impressions. Another interesting thing to note is that G has the highest CPC among the Google experiments but G also has the highest click-through rate, which shows that impressions on the Google network are a lot more relevant to the users than impressions that show up on the partners. However, those impressions come at a higher click price whenever the user clicks on them.</p><p>Let&#8217;s now focus on signups, which is what I really care about for these experiments. 
I don&#8217;t care about impressions or clicks if these are not leading to signups at the end of the funnel.</p><p>Let&#8217;s compare the experiment results using the CPS lens:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!PbSk!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F964ee9dd-0052-42dc-b770-aedde8f7df07_762x252.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!PbSk!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F964ee9dd-0052-42dc-b770-aedde8f7df07_762x252.png 424w, https://substackcdn.com/image/fetch/$s_!PbSk!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F964ee9dd-0052-42dc-b770-aedde8f7df07_762x252.png 848w, https://substackcdn.com/image/fetch/$s_!PbSk!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F964ee9dd-0052-42dc-b770-aedde8f7df07_762x252.png 1272w, https://substackcdn.com/image/fetch/$s_!PbSk!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F964ee9dd-0052-42dc-b770-aedde8f7df07_762x252.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!PbSk!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F964ee9dd-0052-42dc-b770-aedde8f7df07_762x252.png" width="762" height="252" 
data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/964ee9dd-0052-42dc-b770-aedde8f7df07_762x252.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:252,&quot;width&quot;:762,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:175187,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!PbSk!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F964ee9dd-0052-42dc-b770-aedde8f7df07_762x252.png 424w, https://substackcdn.com/image/fetch/$s_!PbSk!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F964ee9dd-0052-42dc-b770-aedde8f7df07_762x252.png 848w, https://substackcdn.com/image/fetch/$s_!PbSk!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F964ee9dd-0052-42dc-b770-aedde8f7df07_762x252.png 1272w, https://substackcdn.com/image/fetch/$s_!PbSk!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F964ee9dd-0052-42dc-b770-aedde8f7df07_762x252.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" 
stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Note how GSA, our previous CPM and CPC winner, is now at the bottom of the Google experiments with GS now being the winner on cost per signup as well as signup rates. GS is closely followed by G. Also note how G and GS had a huge difference in CPM but in terms of signups they both perform very well (which is what I as the advertiser really care about).</p><p>GS is the clear winner here in terms of how much I need to pay per signup on this campaign. This makes sense to me as this experiment leverages search keyword targeting while also displaying some of the ads on cheaper (per click) search partners. The Google ad network in GSA is not adding value and is losing us money, as I originally speculated.&nbsp;</p><p>Unfortunately, in my experiments, LinkedIn was way more expensive and much less effective than Google. 
I was also disappointed that my following on that social network didn&#8217;t grow much with this campaign. This is the social network where I have a presence and I assumed that some extra advertisement could easily expand that following. However, this was not the case.&nbsp;</p><p>On the other hand Google was a very effective channel. They had the lowest cost per click by far, the highest click through rate by far, and more importantly, substantially higher signup rate and substantially lower signup cost, which is all I really cared about. </p><p>I hope this article is helpful for when you are evaluating your own ad campaigns.</p>]]></content:encoded></item><item><title><![CDATA[Combating Ad Fraud with Advanced Bot Detection Solutions]]></title><description><![CDATA[Learn how our team improved a service to detect bots engaging in online ad fraud, discover our approach and the impact of our solution.]]></description><link>https://www.datascienceisnotrocketscience.com/p/identifying-online-bots</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/identifying-online-bots</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Tue, 20 Dec 2022 12:01:51 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/h_600,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F04b46f39-4cb7-4e08-8ca2-c8dac5fbcb13_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!uBuQ!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F04b46f39-4cb7-4e08-8ca2-c8dac5fbcb13_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" 
srcset="https://substackcdn.com/image/fetch/$s_!uBuQ!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F04b46f39-4cb7-4e08-8ca2-c8dac5fbcb13_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!uBuQ!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F04b46f39-4cb7-4e08-8ca2-c8dac5fbcb13_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!uBuQ!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F04b46f39-4cb7-4e08-8ca2-c8dac5fbcb13_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!uBuQ!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F04b46f39-4cb7-4e08-8ca2-c8dac5fbcb13_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!uBuQ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F04b46f39-4cb7-4e08-8ca2-c8dac5fbcb13_1024x1024.png" width="522" height="522" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/04b46f39-4cb7-4e08-8ca2-c8dac5fbcb13_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:522,&quot;bytes&quot;:1334099,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" 
alt="" srcset="https://substackcdn.com/image/fetch/$s_!uBuQ!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F04b46f39-4cb7-4e08-8ca2-c8dac5fbcb13_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!uBuQ!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F04b46f39-4cb7-4e08-8ca2-c8dac5fbcb13_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!uBuQ!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F04b46f39-4cb7-4e08-8ca2-c8dac5fbcb13_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!uBuQ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F04b46f39-4cb7-4e08-8ca2-c8dac5fbcb13_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg 
xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>The Business Solutions Series is a compilation of solutions to various business challenges that I have encountered throughout my professional journey.</p><h4>Context&nbsp;</h4><p>Improve a service to detect bots that engage in online ad fraud.</p><h4>Problem</h4><p>This problem is - for the most part - an unsupervised ML problem (e.g. we don&#8217;t have ground truth for bots). I like this project for this reason, and also because we ended up building different solutions that complemented each other. This allowed us to improve the solution over time.</p><h4>Objective</h4><p>Identify online bots. This is valuable for internet companies because it can substantially reduce the money that brands lose to ad fraud and also protect their services from malicious players without affecting their real users.</p><h4>Solution</h4><p>The first model our team developed was very simple and elegant. Some important things to understand: bots tend to be programmed to visit the same set of fraudulent websites (to commit ad fraud). And, these fraudulent websites are usually not visited by real human beings. This creates an interesting conditional probability distribution for those fraudulent websites.</p><p>First, we computed independent scores for all websites by calculating the probability that any random user will visit the site at any point of time. Popular websites visited by many users (e.g. 
facebook.com) would get a high score (this is the probability that any random user will visit it) and obscure websites would have a very low score (a random user is very unlikely to ever visit it). Then, for all pairs of websites, we also computed the conditional probability of a random user visiting a website given they have already visited the other website in that pair.</p><p>Even though fraudulent websites will have a very low probability of being visited by a random user, their conditional probability will be very high if those users have already visited other fraudulent websites before. These kinds of bots tend to visit mostly fraudulent websites so their browsing history will be heavily represented by website pairs like this one (pairs where each site has low scores independently but very high conditional probability when seen together).</p><p>For each user, we looked at all their browsing websites and also at all of their website pairs. If a user had mostly visited websites that had low probability scores but high conditional probability scores given most seen pairs, there was a very high chance that the user was actually a fraudulent bot. We encoded this logic into the data warehouse, and that helped us easily flag these users moving forward.</p><p>Initially, another easy way to catch bots was to look at browser type and version distributions between known legit websites and known fraudulent websites. Many bots are actually operating inside infected machines and use malware built-in browsers to do their fraudulent browsing under the hood, without the machine&#8217;s user even noticing that this is happening. By monitoring known fraudulent websites we could flag browser types and versions that seem suspicious compared to the browser types and versions seen in legit websites. 
However, fraudsters rapidly realized this was giving them away so they started faking (spoofing) their browser user agents to avoid being flagged so easily.</p><p>We then decided to start collecting a lot of information (hundreds of signals) using custom JavaScript code on those browsers. This was so we could detect browsers that were pretending to be some other browser that they were not. We initially started this project by hard-coding rules on those signals but very quickly realized it was a management nightmare. We needed to keep up with new rules for new browser versions all the time. So, instead, we replaced that process with a machine learning model, where we pre-filtered browsing examples that we were very certain to be clean and trained a model to classify the browser type and version based on the hundreds of signals (features) that we were able to collect from each browser using our JavaScript probe. We then used an <a href="https://www.datascienceisnotrocketscience.com/p/making-the-case-for-cloud-managed">AutoML managed service</a> to train this model. Given how comprehensive the amount and diversity of signals we were able to collect from browsers running our code was, the ML model managed to do an amazing job at predicting the browser type and version - with 100% accuracy - on the &#8220;clean browsing&#8221; hold-out set. We then used the same model to predict the browser type and version in the wild and started flagging those browsers where the user agent didn&#8217;t match the predicted browser type or version.</p><p>Lastly, we also complemented the solution with one more model that helped us separate cloud computers that were being used legitimately to browse the Internet (e.g. a VPN service) from those being used fraudulently (e.g. bots using cloud VMs). For this use case, we got our hands on some labeled data to train our model. 
We found a big list of cloud IPs that we knew for sure were being used by VPN-like services and some other big list of cloud IPs that we were very confident we shouldn&#8217;t see any real human traffic on them (but we were still seeing it).</p><p>Both kinds of cloud IPs would show a high volume of browsing coming from those IPs but we were able to find a few features that helped us train an almost perfect model. We realized that cloud IPs for which no human traffic was expected would have many short lived non-overlapping cookie sessions (bots were likely clearing all cookies after every website visit), while VPN-like IPs would show many concurrent cookie sessions coming from the same IP with a unique distribution of how long different cookie sessions lasted (as real people were actually behind those sessions). By computing counts of total cookies, maximum number of concurrent cookies during some time window, and some metrics about the distribution of the cookie&#8217;s lifetime, we got an AutoML managed service to learn to differentiate these two kinds of Cloud IPs beyond the smaller sample we had labels for.</p><h4>Impact</h4><p>By combining many kinds of models to tackle this problem we were able to build one of the most effective bot detection solutions in the industry.</p>]]></content:encoded></item><item><title><![CDATA[Analyzing Customer Conversations with NLP]]></title><description><![CDATA[Discover how we leveraged NLP models to analyze thousands of customer service calls, identify common issues, and improve client satisfaction.]]></description><link>https://www.datascienceisnotrocketscience.com/p/human-conversation-analysis</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/human-conversation-analysis</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Mon, 05 Dec 2022 19:46:06 GMT</pubDate><enclosure 
url="https://substackcdn.com/image/fetch/h_600,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1b503681-80e6-4b6c-9888-22bbcc054dd2_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!xJH5!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1b503681-80e6-4b6c-9888-22bbcc054dd2_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!xJH5!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1b503681-80e6-4b6c-9888-22bbcc054dd2_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!xJH5!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1b503681-80e6-4b6c-9888-22bbcc054dd2_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!xJH5!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1b503681-80e6-4b6c-9888-22bbcc054dd2_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!xJH5!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1b503681-80e6-4b6c-9888-22bbcc054dd2_1024x1024.png 1456w" sizes="100vw"><img 
src="https://substackcdn.com/image/fetch/$s_!xJH5!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1b503681-80e6-4b6c-9888-22bbcc054dd2_1024x1024.png" width="498" height="498" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/1b503681-80e6-4b6c-9888-22bbcc054dd2_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:498,&quot;bytes&quot;:2009929,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!xJH5!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1b503681-80e6-4b6c-9888-22bbcc054dd2_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!xJH5!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1b503681-80e6-4b6c-9888-22bbcc054dd2_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!xJH5!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1b503681-80e6-4b6c-9888-22bbcc054dd2_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!xJH5!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1b503681-80e6-4b6c-9888-22bbcc054dd2_1024x1024.png 1456w" sizes="100vw" 
fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>The Business Solutions Series is a compilation of solutions to various business challenges that I have encountered throughout my professional journey.</p><h4>Context &amp; Problem</h4><p>Customer service representatives answer calls from thousands of clients a day. 
The customer success team wanted to understand how often different issues came up on those calls and what exactly was being said about those issues by the clients, and by the customer service representatives who answered those calls.</p><p>The team needed an efficient and flexible tool to analyze thousands and thousands of recorded phone calls for different issues.&nbsp;</p><h4>Objective</h4><p>The objective of the project was to process recorded telephone calls to rapidly help identify different topics and issues throughout many conversations. This was needed to understand the prevalence of different issues, the nature of those issues, and how those issues were being handled.</p><h4>Solution</h4><p>To solve this problem we were able to leverage <a href="https://www.tensorflow.org/hub">TensorFlow Hub</a>, which is a free and public repository of pre-trained machine learning models. We were particularly interested in their natural language (text) models. They have several of them including some very interesting <a href="https://tfhub.dev/google/universal-sentence-encoder-multilingual">multilingual ones</a> that can <a href="https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture">embed</a> phrases from 16 or more different languages into the same embedding space (e.g. the same phrase in different languages ends up having very similar embeddings). These models are extremely easy to use, as you can just load them into Python and use them as a function that takes a phrase (string) as an input and outputs an embedding for that phrase. You can also make use of these models inside <a href="https://www.tensorflow.org/tfx/guide/serving">TensorFlow Serving</a> if needed (to easily speed up and scale inference).</p><p>For our specific problem, we set up a <a href="https://cloud.google.com/dataflow">managed Apache Beam</a> job that would run a few times a day to process all new calls. 
This process called a <a href="https://cloud.google.com/speech-to-text">managed cloud transcription service</a> that returned separate strings every time the service detected a new speaker or a long pause in the audio files. Our job was to then take each of those strings and run them through the pre-trained language model of choice, which outputted an embedding for that phrase. Then, we saved the transcript string, the call identifier, the start time of the phrase, the duration of the phrase, and the embedding of the phrase into a data warehouse.</p><p>Lastly, we let users define topics and issues of interest (by writing a description of the topic in plain English) and we used the same pre-trained model to convert those descriptions into embeddings. Once we had the topic embeddings and the transcript embeddings, we could easily use a zero-shot model (basically embedding <a href="https://en.wikipedia.org/wiki/Cosine_similarity">cosine similarity</a>)&nbsp;coded directly inside the data warehouse to find all the phrases in the transcripts that were semantically similar to a topic of interest, and show these to the end user for their analysis.</p><h4>Impact</h4><p>The company was able to easily understand the true scale of different issues, and find out what clients were saying about specific issues and how representatives were responding to them. 
This led to better issue prioritization and robust solutions to these common issues, which then substantially reduced the number of calls the team was receiving and increased client satisfaction over time.</p>]]></content:encoded></item><item><title><![CDATA[AutoML Stops Fraudulent Merchants]]></title><description><![CDATA[Learn how we identified online merchants selling prohibited products and shut them down to protect acquiring banks from fraudulent transactions.]]></description><link>https://www.datascienceisnotrocketscience.com/p/finding-dishonest-merchants</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/finding-dishonest-merchants</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Tue, 22 Nov 2022 05:23:18 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/h_600,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1834abda-3b3e-4b39-b918-255dfb1a2b8f_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!ifEv!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1834abda-3b3e-4b39-b918-255dfb1a2b8f_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!ifEv!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1834abda-3b3e-4b39-b918-255dfb1a2b8f_1024x1024.png 424w, 
https://substackcdn.com/image/fetch/$s_!ifEv!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1834abda-3b3e-4b39-b918-255dfb1a2b8f_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!ifEv!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1834abda-3b3e-4b39-b918-255dfb1a2b8f_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!ifEv!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1834abda-3b3e-4b39-b918-255dfb1a2b8f_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!ifEv!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1834abda-3b3e-4b39-b918-255dfb1a2b8f_1024x1024.png" width="534" height="534" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/1834abda-3b3e-4b39-b918-255dfb1a2b8f_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:false,&quot;imageSize&quot;:&quot;normal&quot;,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:534,&quot;bytes&quot;:1495219,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" 
srcset="https://substackcdn.com/image/fetch/$s_!ifEv!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1834abda-3b3e-4b39-b918-255dfb1a2b8f_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!ifEv!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1834abda-3b3e-4b39-b918-255dfb1a2b8f_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!ifEv!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1834abda-3b3e-4b39-b918-255dfb1a2b8f_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!ifEv!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1834abda-3b3e-4b39-b918-255dfb1a2b8f_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg 
xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>The Business Solutions Series is a compilation of solutions to various business challenges that I have encountered throughout my professional journey.</p><h4>Context&nbsp;</h4><p>The goal of this project was to protect acquiring banks - banks that provide credit card terminals to businesses - from nefarious merchants. These are merchants who use their online terminals to transact on products and services that are not allowed by the terms of the contract with their bank (e.g. counterfeit products, illegal substances, etc.)</p><h4>Problem</h4><p>The way these fraudulent merchants were operating is by sending bank applications misrepresenting the nature of their business in order to be approved. Once approved, they used the online terminals on their websites to sell prohibited products.&nbsp;</p><p>In order to crack down on this practice a team of analysts was going through all the merchant applications to verify that their listed website was not selling any prohibited product/services (and flagging those that were). In the early days it was very easy to detect thousands and thousands of these bogus applications by only going to their listed websites. But, as soon as they realized it was easy to monitor them, they started creating professional looking front websites in which they pretended to sell allowed products. 
However, their real intention once approved was to still use the online terminal on an undisclosed website that was breaking the terms.</p><h4>Objective</h4><p>To prevent nefarious merchants from using credit card terminals to sell prohibited products.&nbsp;</p><h4>Solution</h4><p>We had a rich labeled dataset of websites (several tens of thousands) that sold prohibited products from the early days. The data set consisted of a text document (text parsed from those websites with a crawler) and the different labels that our analysts provided when flagging them for prohibited products.</p><p>We were able to feed this data directly to an <a href="https://www.datascienceisnotrocketscience.com/p/making-the-case-for-cloud-managed">AutoML managed tool</a> that would automatically pre-process all the text into 1-grams and 2-grams (lists of all words and lists of all pairs of words) and learn <a href="https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture">embeddings</a> for all these n-grams. It was also able to find the best neural network architecture and hyperparameters to train a very accurate multi-class classification model. Once we had this model, we piggybacked on an open source project called <a href="https://commoncrawl.org/">Common Crawl</a> which periodically crawls the whole internet and saves the crawled data into a public cloud storage for anyone to read. We then spun up a managed <a href="https://beam.apache.org/">Apache Beam</a> job to process the Common Crawl data and run each crawled website text through our model. Lastly, we dropped all the URLs with predictions into a data warehouse for the analysts to query later.</p><p>We then armed all our analysts with credit cards that were set up to always be declined by the issuing bank. We did that to track the merchants and acquiring banks on the other side of the transactions. 
By using the scores in the warehouse (prioritizing those with higher scores for prohibited products) our analysts were able to quickly find thousands of online stores selling prohibited products and taking credit cards without their bank&#8217;s knowledge. We then attempted to make purchases on those sites in order to find which specific merchants were using their online terminals on websites they didn&#8217;t list on their applications. These merchants were later shut down for breaking the terms.</p><h4>Impact</h4><p>The main takeaway from this story is that because we already had a very rich training dataset and the Common Crawl data at our disposal, we were able to easily leverage a set of managed services to produce very good business results. We only needed the initial model to have a low false positive rate on the top-scored websites, and the AutoML tool managed to do that very well. The scores from this model were then used to rank websites for analyst prioritization which allowed the analysts to focus almost exclusively on nefarious merchants and rapidly process and shut down thousands and thousands of them.</p>]]></content:encoded></item><item><title><![CDATA[How AI tools helped me create this data science post]]></title><description><![CDATA[Reveal how AI tools streamlined creating this data science post. 
Generate original images, write Python code, and overcome writer's block quickly.]]></description><link>https://www.datascienceisnotrocketscience.com/p/how-did-ai-help-create-this-post</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/how-did-ai-help-create-this-post</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Mon, 07 Nov 2022 09:24:41 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F3d07b7e3-c14c-46ab-9421-d16ae695ae96_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!lblV!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F3d07b7e3-c14c-46ab-9421-d16ae695ae96_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!lblV!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F3d07b7e3-c14c-46ab-9421-d16ae695ae96_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!lblV!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F3d07b7e3-c14c-46ab-9421-d16ae695ae96_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!lblV!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F3d07b7e3-c14c-46ab-9421-d16ae695ae96_1024x1024.png 1272w, 
https://substackcdn.com/image/fetch/$s_!lblV!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F3d07b7e3-c14c-46ab-9421-d16ae695ae96_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!lblV!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F3d07b7e3-c14c-46ab-9421-d16ae695ae96_1024x1024.png" width="546" height="546" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/3d07b7e3-c14c-46ab-9421-d16ae695ae96_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:546,&quot;bytes&quot;:1300660,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!lblV!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F3d07b7e3-c14c-46ab-9421-d16ae695ae96_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!lblV!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F3d07b7e3-c14c-46ab-9421-d16ae695ae96_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!lblV!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F3d07b7e3-c14c-46ab-9421-d16ae695ae96_1024x1024.png 
1272w, https://substackcdn.com/image/fetch/$s_!lblV!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F3d07b7e3-c14c-46ab-9421-d16ae695ae96_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>For this post, I am testing three new out-of-the box AI tools that will help me easily and quickly put together a complete data science post for my publication.</p><p>The first tool I'll introduce is <a href="https://openai.com/dall-e-2/">DALL&#183;E 2 from OpenAI</a>. 
This new AI system can create photorealistic images and artwork by simply reading a description given in natural language.</p><p>So far, I've used this tool to create all illustrations for <a href="https://www.datascienceisnotrocketscience.com/p/coming-soon">my Substack posts</a>, including this one.</p><p>With this tool, you can create art by simply typing a description of the image you want to generate. For example, for the illustration in this article, I typed out the following:</p><p><em>&#8220;Three robots, one drawing a painting, one writing on a notepad, and one playing with dice&#8221;</em></p><p>The tool produced these four image options from my prompt:</p><div class="captioned-image-container"><figure><a class="image-link image2" target="_blank" href="https://substackcdn.com/image/fetch/$s_!DDvw!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fab1658ed-aa28-48be-85a9-72ab9a3d9fec_1600x398.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!DDvw!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fab1658ed-aa28-48be-85a9-72ab9a3d9fec_1600x398.png 424w, https://substackcdn.com/image/fetch/$s_!DDvw!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fab1658ed-aa28-48be-85a9-72ab9a3d9fec_1600x398.png 848w, https://substackcdn.com/image/fetch/$s_!DDvw!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fab1658ed-aa28-48be-85a9-72ab9a3d9fec_1600x398.png 1272w, 
https://substackcdn.com/image/fetch/$s_!DDvw!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fab1658ed-aa28-48be-85a9-72ab9a3d9fec_1600x398.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!DDvw!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fab1658ed-aa28-48be-85a9-72ab9a3d9fec_1600x398.png" width="1456" height="362" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/ab1658ed-aa28-48be-85a9-72ab9a3d9fec_1600x398.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:362,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!DDvw!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fab1658ed-aa28-48be-85a9-72ab9a3d9fec_1600x398.png 424w, https://substackcdn.com/image/fetch/$s_!DDvw!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fab1658ed-aa28-48be-85a9-72ab9a3d9fec_1600x398.png 848w, https://substackcdn.com/image/fetch/$s_!DDvw!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fab1658ed-aa28-48be-85a9-72ab9a3d9fec_1600x398.png 1272w, 
https://substackcdn.com/image/fetch/$s_!DDvw!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fab1658ed-aa28-48be-85a9-72ab9a3d9fec_1600x398.png 1456w" sizes="100vw"></picture><div></div></div></a></figure></div><p>While I liked the third option best, it had a few issues. The drawing included four robots instead of three. Additionally, none of them appeared to be playing with dice, and it wasn't evident which robot was writing and which one was drawing.</p><p>The tool also includes an editing function that allows you to delete certain parts of any illustration and change the prompt. The tool then generates four more options based on your edits.</p><p>I deleted the following from my favorite illustration:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!4F7T!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fceb7b8a0-c4b0-4bc4-ad7b-43fc5a955c6f_708x914.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!4F7T!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fceb7b8a0-c4b0-4bc4-ad7b-43fc5a955c6f_708x914.png 424w, https://substackcdn.com/image/fetch/$s_!4F7T!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fceb7b8a0-c4b0-4bc4-ad7b-43fc5a955c6f_708x914.png 848w, 
https://substackcdn.com/image/fetch/$s_!4F7T!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fceb7b8a0-c4b0-4bc4-ad7b-43fc5a955c6f_708x914.png 1272w, https://substackcdn.com/image/fetch/$s_!4F7T!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fceb7b8a0-c4b0-4bc4-ad7b-43fc5a955c6f_708x914.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!4F7T!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fceb7b8a0-c4b0-4bc4-ad7b-43fc5a955c6f_708x914.png" width="302" height="389.87005649717514" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/ceb7b8a0-c4b0-4bc4-ad7b-43fc5a955c6f_708x914.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:914,&quot;width&quot;:708,&quot;resizeWidth&quot;:302,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!4F7T!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fceb7b8a0-c4b0-4bc4-ad7b-43fc5a955c6f_708x914.png 424w, https://substackcdn.com/image/fetch/$s_!4F7T!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fceb7b8a0-c4b0-4bc4-ad7b-43fc5a955c6f_708x914.png 848w, 
https://substackcdn.com/image/fetch/$s_!4F7T!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fceb7b8a0-c4b0-4bc4-ad7b-43fc5a955c6f_708x914.png 1272w, https://substackcdn.com/image/fetch/$s_!4F7T!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fceb7b8a0-c4b0-4bc4-ad7b-43fc5a955c6f_708x914.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>And modified the description to say:</p><p><em>&#8220;A robot writing in the 
bottom left, a robot painting on the bottom right, and a robot rolling dice in the back&#8221;</em></p><p>The following 4 options were generated by the tool:</p><div class="captioned-image-container"><figure><a class="image-link image2" target="_blank" href="https://substackcdn.com/image/fetch/$s_!7PsH!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F0d3b7035-98c6-4c6f-b15b-d58b6b289b4a_1600x385.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!7PsH!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F0d3b7035-98c6-4c6f-b15b-d58b6b289b4a_1600x385.png 424w, https://substackcdn.com/image/fetch/$s_!7PsH!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F0d3b7035-98c6-4c6f-b15b-d58b6b289b4a_1600x385.png 848w, https://substackcdn.com/image/fetch/$s_!7PsH!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F0d3b7035-98c6-4c6f-b15b-d58b6b289b4a_1600x385.png 1272w, https://substackcdn.com/image/fetch/$s_!7PsH!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F0d3b7035-98c6-4c6f-b15b-d58b6b289b4a_1600x385.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!7PsH!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F0d3b7035-98c6-4c6f-b15b-d58b6b289b4a_1600x385.png" width="1456" height="350" 
data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/0d3b7035-98c6-4c6f-b15b-d58b6b289b4a_1600x385.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:350,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!7PsH!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F0d3b7035-98c6-4c6f-b15b-d58b6b289b4a_1600x385.png 424w, https://substackcdn.com/image/fetch/$s_!7PsH!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F0d3b7035-98c6-4c6f-b15b-d58b6b289b4a_1600x385.png 848w, https://substackcdn.com/image/fetch/$s_!7PsH!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F0d3b7035-98c6-4c6f-b15b-d58b6b289b4a_1600x385.png 1272w, https://substackcdn.com/image/fetch/$s_!7PsH!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F0d3b7035-98c6-4c6f-b15b-d58b6b289b4a_1600x385.png 1456w" sizes="100vw" loading="lazy"></picture><div></div></div></a></figure></div><p>The tool&#8217;s third option was my favorite, even though it added back a fourth robot. 
At least the two robots in the back are now clearly playing with dice, and with the two robots in the front it is now clear which one is writing and which one is painting. I ended up using this as the illustration for this post.</p><p>The second tool on my list is <a href="https://github.com/features/copilot">GitHub Copilot</a>. It's a fantastic time-saver that allows you to code in multiple programming languages, including Python, by simply using natural-language prompts.</p><p>In a post on my publication about <a href="https://www.datascienceisnotrocketscience.com/p/what-is-the-probability-that-mexico">the probability of earthquakes in Mexico</a>, I used some basic Python code to run simulations in order to answer the question posed by the article. For this new post, I want to use GitHub Copilot to write the Python code. This tool will automatically translate my English-language instructions into working Python code.</p><p>I started by creating an empty file called &#8220;simulation.py&#8221; and typed the following comments:</p><p><code># sample 64 days from 1 to 365<br># and assess if any day was sampled three or more times<br># if so, return true<br># If not, return false</code></p><p>As soon as I hit Enter, the tool suggested adding the following Python code to the file:</p><p><code>def simulation():</code></p><p><code>&nbsp;&nbsp;&nbsp;days = np.random.choice(np.arange(1, 366), size=64, replace=True)</code></p><p><code>&nbsp;&nbsp;&nbsp;return np.any(np.bincount(days) &gt;= 3)</code></p><p>I was impressed by this tool and accepted the recommended code, but I didn't know that it was just getting started. 
When I pressed Enter again after the tool generated its code, it automatically suggested adding these new comments:</p><p><code># Path: main.py<br># run simulation 10000 times<br># and calculate the probability<br># that at least one day was sampled three or more times</code></p><p>The comments impressed me as they were exactly what I wanted my program to do next--I didn't even have to write them. The tool even accurately guessed that I wanted to run the simulation ten thousand times. It was quite spooky.</p><p>I hit Enter, and the tool immediately created the rest of the code it required from its own comments.</p><p><code>def main():</code></p><p><code>&nbsp;&nbsp;&nbsp;simulations = [simulation() for _ in range(10000)]</code></p><p><code>&nbsp;&nbsp;&nbsp;print(np.mean(simulations))</code></p><p><code>if __name__ == '__main__':</code></p><p><code>&nbsp;&nbsp;&nbsp;main()</code></p><p>Running that code produced the same results as my original code, except it was quicker to write and neater. All I did was type a few short comments and then press Enter repeatedly to accept all the recommendations--the whole process took me less than two minutes.</p><p>In addition to the aforementioned tools, I'm also using <a href="https://www.jasper.ai/">Jasper</a> to help create this post. For those who don't know, Jasper is a high-quality AI copywriting tool that eliminates creative blocks so you can write original content faster.</p><p>I hastily typed this article into their tool and the result was much better-quality text. In fact, all of the text you've read in this post has been edited or rewritten by the tool.</p><p>For example, here is the original text for this last paragraph that the tool completely rewrote:</p><p><em>"The last tool I&#8217;m using to produce this post is called Jasper. Jasper is a high quality AI copywriting tool that helps break through creative blocks to create amazing, original content faster. 
The text for this article I quickly and carelessly typed into their tool and got the tool to edit and rewrite the text in much better shape. All the text you&#8217;ve read so far for this post was edited and sometimes fully rewritten by the tool"</em></p><p>In less than two hours, I learned how to use three different tools and produced the content for this post. It's fascinating that we've reached a point in AI technology where very good but not perfect vision and language models can be leveraged by tools that help people complete their tasks much faster and more efficiently.</p>]]></content:encoded></item><item><title><![CDATA[Efficient Elevator Repairs Using Machine Learning]]></title><description><![CDATA[Discover how our ML system reduced elevator servicing costs and downtime by predicting the most likely failures.]]></description><link>https://www.datascienceisnotrocketscience.com/p/efficient-elevator-repairs</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/efficient-elevator-repairs</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Mon, 24 Oct 2022 20:16:50 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/h_600,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8b7ef99-4d73-44fa-887e-511b14f2317d_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!VazF!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8b7ef99-4d73-44fa-887e-511b14f2317d_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" 
srcset="https://substackcdn.com/image/fetch/$s_!VazF!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8b7ef99-4d73-44fa-887e-511b14f2317d_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!VazF!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8b7ef99-4d73-44fa-887e-511b14f2317d_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!VazF!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8b7ef99-4d73-44fa-887e-511b14f2317d_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!VazF!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8b7ef99-4d73-44fa-887e-511b14f2317d_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!VazF!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8b7ef99-4d73-44fa-887e-511b14f2317d_1024x1024.png" width="544" height="544" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/a8b7ef99-4d73-44fa-887e-511b14f2317d_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:544,&quot;bytes&quot;:1571788,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" 
alt="" srcset="https://substackcdn.com/image/fetch/$s_!VazF!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8b7ef99-4d73-44fa-887e-511b14f2317d_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!VazF!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8b7ef99-4d73-44fa-887e-511b14f2317d_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!VazF!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8b7ef99-4d73-44fa-887e-511b14f2317d_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!VazF!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8b7ef99-4d73-44fa-887e-511b14f2317d_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg 
xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>The Business Solutions Series is a compilation of solutions to various business challenges that I have encountered throughout my professional journey.</p><h4>Context&nbsp;</h4><p>We were working with a company that sells and then services elevators for buildings all across the world.</p><h4>Problem</h4><p>Elevator technicians are expensive to hire and their time is very valuable. When an elevator breaks down and the technician is called, oftentimes they won&#8217;t know what is really wrong with the elevator until they are onsite. Also, more often than not, they won&#8217;t have with them all the parts they need to fix it (as it is impossible for them to preemptively bring all possible spare parts with them on all of their visits). Hence, they will have to arrange to come back at a later time with the right parts to fix it (and do it soon as elevators need to be fixed soon per the terms of service).</p><h4>Objective</h4><p>To reduce elevator servicing costs and decrease downtime of broken elevators.</p><h4>Solution</h4><p>The company we were working with had implemented a system on many of their elevators that would send all elevator system errors to their cloud service. 
Initially, they thought these reports were noisy and useless as elevators produce dozens of error codes all the time, and most of the time, nothing is really wrong with the elevator.&nbsp;</p><p>However, they also had technician reports of what exactly was wrong with an elevator every time they had to go fix one.&nbsp;</p><p>By using the technician reports together with the system error logs, we were able to identify sequences of errors (sequences of length 2 to 5) that were significantly correlated with instances where specific problems happened. We then picked the top 200 of these sequences and created a training set with the technician-reported problems together with the number of times each of those 200 pre-selected sequences appeared in the logs in the few hours before the elevator was reported as broken. We then trained a multi-class classification model (using an <a href="https://www.datascienceisnotrocketscience.com/p/making-the-case-for-cloud-managed">AutoML managed service</a>). Lastly, we made changes to the servicing process for the technicians to bring spare parts for the top 10 most likely failures - based on the model predictions - every time they went to check on a broken elevator.</p><h4>Impact</h4><p>We were able to substantially reduce the number of instances where the technicians - on their first visit - didn&#8217;t have the correct parts to fix the elevator with them. Hence, we reduced servicing costs overall. 
Note that for this business solution to produce substantial savings, the multi-class classification model didn&#8217;t really need to be very accurate on its top prediction; it only needed to be very good at making sure that one of the top 10 predictions was the correct one most of the time.</p>]]></content:encoded></item><item><title><![CDATA[Stopping Software Piracy: Using Simple Statistical Test]]></title><description><![CDATA[Learn how we stopped software piracy by introducing a statistical test that flags suspicious activity, reducing piracy practices and increasing revenue.]]></description><link>https://www.datascienceisnotrocketscience.com/p/cracking-down-on-software-piracy</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/cracking-down-on-software-piracy</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Wed, 12 Oct 2022 13:55:48 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F21619178-7121-4b33-b586-ec229eec1817_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!eY01!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F21619178-7121-4b33-b586-ec229eec1817_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!eY01!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F21619178-7121-4b33-b586-ec229eec1817_1024x1024.png 424w, 
https://substackcdn.com/image/fetch/$s_!eY01!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F21619178-7121-4b33-b586-ec229eec1817_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!eY01!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F21619178-7121-4b33-b586-ec229eec1817_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!eY01!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F21619178-7121-4b33-b586-ec229eec1817_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!eY01!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F21619178-7121-4b33-b586-ec229eec1817_1024x1024.png" width="552" height="552" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/21619178-7121-4b33-b586-ec229eec1817_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:552,&quot;bytes&quot;:1531587,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!eY01!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F21619178-7121-4b33-b586-ec229eec1817_1024x1024.png 
424w, https://substackcdn.com/image/fetch/$s_!eY01!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F21619178-7121-4b33-b586-ec229eec1817_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!eY01!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F21619178-7121-4b33-b586-ec229eec1817_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!eY01!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F21619178-7121-4b33-b586-ec229eec1817_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 
9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>The Business Solutions Series is a compilation of solutions to various business challenges that I have encountered throughout my professional journey.</p><h4>Context&nbsp;</h4><p>We were working with computer factories all over the world, and these factories were being handed rolls of certificates that they needed to include with the computers. These rolls were being produced using technology similar to the one used to make money. Each roll had a roll ID and each certificate inside a roll had a certificate ID. These were being used to track their life and usage across the supply chain.&nbsp;</p><p>The purpose of these certificates was for the end user to reactivate the software in case they needed to perform a factory reset on their device. Computers came from the factory with its software already activated, so most of the time these certificate codes were not used. However, a common exception was random hardware glitches that would require a factory reset.</p><h4>Problem</h4><p>The certificate codes were very valuable in the black market as they allowed people who didn&#8217;t purchase the software legally to activate pirated versions of it. This gap rapidly developed into a cat and mouse dynamic with some of the factory workers stealing codes with increasingly sophisticated methods, and the software company trying to stop the practice.&nbsp;</p><p>Initially the workers were writing the codes in pieces of paper as they were taking each certificate out of the roll and putting it into the hardware box. The software company would just quickly block activation certificates once they started seeing more than a few usages of a single certificate. 
But then the workers started copying more certificate numbers so none of them would need to be reused more than once or twice on the black market.&nbsp;</p><p>Later, the company was also able to identify rolls - with the roll IDs - that seemed to be fully compromised and flag the whole roll. This way the certificates of those rolls couldn&#8217;t be abused anymore. This led to even more sophisticated code copying at the factories, where the workers stealing them would not steal all certificates from the same roll and would instead alternate between copying them or not as they were processing them.</p><h4>Objective &amp; Assumptions</h4><p>The objective of the project was to stop the piracy practice of stealing certificates.&nbsp;</p><p>We knew that legitimate factory resets do happen once in a while, but they were mostly due to &#8220;random&#8221; glitches in the hardware. We also assumed that humans stealing the codes were probably not using random number generators when choosing which certificates to steal (even if their code copy patterns had evolved to be more complex).</p><h4>Solution</h4><p>My manager introduced me to a statistical test called the &#8220;runs test&#8221;. It is a simple test where we marked the certificates with a binary flag: whether or not they had been used for reactivation. Then, we ordered the flag values into sequences of positive and negative runs - in the same order that the certificates came on each roll, where a run ends every time the next certificate has a different flag value.</p><p>For example: A roll with 26 certificates and an activation sequence 0000<strong>111</strong>00<strong>1111</strong>000<strong>1</strong>00<strong>111</strong>0000 has 9 runs.</p><p>The test consists of estimating the expected number of runs based on how many positive and negative values were seen on the roll under the hypothesis that each element in the sequence is independently drawn from the same distribution. 
If the actual number of runs is a few standard deviations away from the expected value, we would then flag that roll as suspicious.</p><p>The formulas are so simple that they were very easy to implement in the SQL data warehouse and set up - right in the warehouse - a monitoring process to flag any suspicious roll moving forward.</p><p>If you are interested in learning more about this test, <a href="https://en.wikipedia.org/wiki/Wald&#8211;Wolfowitz_runs_test">this Wikipedia article</a> can be a good starting point.</p><h4>Impact&nbsp;</h4><p>The new flagging mechanism worked like a charm. The company was able to find - very quickly - thousands of rolls that they didn&#8217;t know had been compromised. They were also able to monitor new rolls and quickly react moving forward. This had a very positive impact on revenue as piracy practices were massively reduced after this went to production.</p>]]></content:encoded></item><item><title><![CDATA[Cloud Managed Services & Favorite Machine Learning Tools]]></title><description><![CDATA[Discover how cloud managed services can improve your machine learning infrastructure. 
Explore top managed services for ML, plus new promising technologies.]]></description><link>https://www.datascienceisnotrocketscience.com/p/making-the-case-for-cloud-managed</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/making-the-case-for-cloud-managed</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Mon, 03 Oct 2022 21:02:57 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F11e1888f-ebf8-4d19-9a3f-a307870cee89_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!MGZe!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b1e4840-8935-4f7a-9a73-dffe646417be_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!MGZe!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b1e4840-8935-4f7a-9a73-dffe646417be_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!MGZe!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b1e4840-8935-4f7a-9a73-dffe646417be_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!MGZe!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b1e4840-8935-4f7a-9a73-dffe646417be_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!MGZe!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b1e4840-8935-4f7a-9a73-dffe646417be_1024x1024.png 
1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!MGZe!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b1e4840-8935-4f7a-9a73-dffe646417be_1024x1024.png" width="422" height="422" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/7b1e4840-8935-4f7a-9a73-dffe646417be_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:422,&quot;bytes&quot;:1784129,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!MGZe!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b1e4840-8935-4f7a-9a73-dffe646417be_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!MGZe!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b1e4840-8935-4f7a-9a73-dffe646417be_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!MGZe!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b1e4840-8935-4f7a-9a73-dffe646417be_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!MGZe!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b1e4840-8935-4f7a-9a73-dffe646417be_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button 
tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>As someone who's passionate about data science, I've discovered that one of the best ways to reduce my team's dependence on others is to utilize cloud managed services for our machine learning infrastructure. <a href="https://www.datascienceisnotrocketscience.com/p/what-is-truly-important-to-be-a-successful">In a previous post</a>, I mentioned the significance of enhancing software engineering skills within the team. However, I believe that deploying clusters, monitoring virtual machines, or writing intricate parallelization logic shouldn't be a priority when we have the option to utilize managed services that can do all of this for us. 
It's important to always be on the lookout for ways to enhance our work and make it more efficient.</p><p>Personally, I find Google Cloud Platform (GCP) to be the best cloud provider for machine learning work. Although I have experience building solutions on various platforms such as AWS, Azure, and on-premises Hive/Spark, GCP remains my top choice. Below is a list of my favorite managed services within GCP. For those who find some of the concepts difficult to grasp, there are plenty of blog posts and documentation available for further exploration.</p><ol><li><p><strong><a href="https://cloud.google.com/vertex-ai/docs/beginner/beginners-guide/">Vertex AI AutoML</a></strong>. This managed service specializes in scaling automatic model training, feature engineering, hyperparameter tuning, and neural architecture search. In my experience with various AutoML tools, such as H2O Driverless AI, Turi (prior to its acquisition by Apple), Azure AutoML, and SageMaker, Vertex AI outperforms them in terms of ease of use, scalability, infrastructure reliability, accuracy, and flexibility in how models can be utilized.</p></li><li><p><strong><a href="https://cloud.google.com/vertex-ai/docs/pipelines/introduction">Vertex AI Pipelines</a></strong>. This managed service allows end-users to create complex orchestration pipelines to train and deploy new models on a schedule. The creation of pipeline components is achieved through simple Python functions, which can then be mixed, matched, and reused to generate complex execution graphs. The managed service handles the execution of these pipelines on managed compute infrastructure, where each component of any pipeline runs in its own separate Docker container.</p></li><li><p><strong><a href="https://cloud.google.com/bigquery-ml/docs">BigQuery</a></strong>. A managed data warehouse (SQL) that separates storage from compute (similar to Snowflake) to scale significantly. 
This service has in-built scalable ML models (such as k-means, matrix factorization, simple regression) and robust integration with Vertex AI and DataFlow.</p></li><li><p><strong><a href="https://cloud.google.com/dataflow">DataFlow</a></strong>. A managed service for Apache Beam, which aids in defining complex data processing pipelines with simple Python code. The managed service then scales and parallelizes these pipelines, without end-users having to be concerned with the complexities of scaling and parallelizing their job, or the compute clusters behind their job.</p></li><li><p><strong><a href="https://www.tensorflow.org/hub">TensorFlow Hub</a></strong>. This repository offers free trained models for the transformation of text or image data into powerful <a href="https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture">embeddings</a> that can be used as features in individual models (transfer learning).</p></li></ol><p>In conclusion, cloud managed services have revolutionized the way data scientists work with machine learning infrastructure. The use of managed services can simplify and accelerate the development of new Data Science products.</p><p></p>]]></content:encoded></item><item><title><![CDATA[Tips for Succeeding in Data Science without a PhD]]></title><description><![CDATA[Discover the key skills for successful Data Science without a PhD. 
Problem framing, statistical thinking, and software engineering are crucial for success.]]></description><link>https://www.datascienceisnotrocketscience.com/p/what-is-truly-important-to-be-a-successful</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/what-is-truly-important-to-be-a-successful</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Mon, 26 Sep 2022 06:23:02 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fcc228fd9-8afb-49b9-a996-73dacad49941_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!m1GQ!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fcc228fd9-8afb-49b9-a996-73dacad49941_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!m1GQ!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fcc228fd9-8afb-49b9-a996-73dacad49941_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!m1GQ!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fcc228fd9-8afb-49b9-a996-73dacad49941_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!m1GQ!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fcc228fd9-8afb-49b9-a996-73dacad49941_1024x1024.png 1272w, 
https://substackcdn.com/image/fetch/$s_!m1GQ!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fcc228fd9-8afb-49b9-a996-73dacad49941_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!m1GQ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fcc228fd9-8afb-49b9-a996-73dacad49941_1024x1024.png" width="548" height="548" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/cc228fd9-8afb-49b9-a996-73dacad49941_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:548,&quot;bytes&quot;:1836489,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!m1GQ!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fcc228fd9-8afb-49b9-a996-73dacad49941_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!m1GQ!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fcc228fd9-8afb-49b9-a996-73dacad49941_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!m1GQ!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fcc228fd9-8afb-49b9-a996-73dacad49941_1024x1024.png 
1272w, https://substackcdn.com/image/fetch/$s_!m1GQ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2Fcc228fd9-8afb-49b9-a996-73dacad49941_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Throughout my time in the world of data science and machine learning, I've had the pleasure of speaking with many people who are fascinated by this field. However, I've noticed that a lot of individuals become overwhelmed and doubtful about pursuing it after a short time. 
They often feel discouraged when they encounter the complicated papers on advanced neural network architectures and discover that those engaged in this kind of work generally possess highly specialized doctoral degrees, extensive education, and significant experience. As a result, they see the idea of acquiring the necessary knowledge and experience as daunting and almost impossible.</p><p>However, it's important to note that a complete understanding of every paper is not necessary to enter the field, and neither is a PhD. Advanced machine learning work is fascinating, and it's certainly going to be an influential factor in the future of the field, but the majority of Data Science teams - whether in large or small companies - don't require that level of specialization. It's not necessary to fully comprehend every paper to be effective in your day-to-day work.</p><p>There are several key factors that are much more important in becoming a successful data scientist, including:</p><ol><li><p>Maintaining a curious and inquisitive attitude towards the business problems you are trying to solve. You'll need to learn how to clearly frame business problems as data science problems to tackle them effectively later. Additionally, it's crucial to identify the actual problem that needs to be solved - often, what the business thinks they want to solve isn't actually the issue. By identifying the correct problem from the beginning of the project, you'll be able to build successful and transformative solutions to real-world problems.</p></li><li><p>Cultivating a statistical and analytical mindset in everything you do, from how you collect data to how you frame problems. It's essential to be mindful of any biases that may exist in either the data or the framing of the problem. 
Additionally, be careful to avoid data leakage and ensure that you're assessing solutions based on what is truly valuable for the specific business problem.</p></li><li><p>Developing your software engineering skills, even if you don't have a computer science background. This means learning to code effectively in Python and SQL, and writing clean and decoupled code that is easily reusable and maintainable. Successful data science teams are those that can deploy their business solutions to production without depending on other teams. If you're new to this area, don't worry - the other two skills are more important, and you'll have the opportunity to learn and practice as you work.</p></li></ol><p>Above all, it's important to enjoy the journey of learning and growing in Data Science. Don't be too hard on yourself and remember to stay curious and open to new challenges. By cultivating the skills mentioned above and maintaining a positive attitude, you'll be well on your way to a fulfilling career in Data Science.</p>]]></content:encoded></item><item><title><![CDATA[Three Major Earthquakes in Mexico: Exploring the Probability]]></title><description><![CDATA[On September 19th, Mexico experienced its third major earthquake. But what is the probability of this happening? 
Explore the data and find out.]]></description><link>https://www.datascienceisnotrocketscience.com/p/what-is-the-probability-that-mexico</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/what-is-the-probability-that-mexico</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Wed, 21 Sep 2022 18:45:32 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F5a65916d-90ef-42b7-be4f-23f15fa4732d_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!ZL3K!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F5a65916d-90ef-42b7-be4f-23f15fa4732d_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!ZL3K!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F5a65916d-90ef-42b7-be4f-23f15fa4732d_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!ZL3K!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F5a65916d-90ef-42b7-be4f-23f15fa4732d_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!ZL3K!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F5a65916d-90ef-42b7-be4f-23f15fa4732d_1024x1024.png 1272w, 
https://substackcdn.com/image/fetch/$s_!ZL3K!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F5a65916d-90ef-42b7-be4f-23f15fa4732d_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!ZL3K!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F5a65916d-90ef-42b7-be4f-23f15fa4732d_1024x1024.png" width="448" height="448" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/5a65916d-90ef-42b7-be4f-23f15fa4732d_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:448,&quot;bytes&quot;:1819179,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!ZL3K!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F5a65916d-90ef-42b7-be4f-23f15fa4732d_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!ZL3K!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F5a65916d-90ef-42b7-be4f-23f15fa4732d_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!ZL3K!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F5a65916d-90ef-42b7-be4f-23f15fa4732d_1024x1024.png 
1272w, https://substackcdn.com/image/fetch/$s_!ZL3K!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F5a65916d-90ef-42b7-be4f-23f15fa4732d_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Mexico had a major earthquake, again, on September 19th. This is the third major earthquake on this specific date (<a href="https://www.nytimes.com/2022/09/19/world/americas/mexico-earthquake-september-19.html">Earthquake Shakes Mexico on Anniversary of 2 Major Temblors</a>). 
</p><p>Many people in the country are in awe of this, asking themselves how something like this could happen. </p><p>In fact, the probability of three major earthquakes (over 7.0 on the Richter scale) happening on the same date is not as low as you would think. <strong>It is actually about 25% likely to happen</strong>. This is the same probability as getting two consecutive "heads" when tossing a coin twice.</p><p>I was able to estimate this probability with a simple Python script. I simulated (100K times) picking random dates for the 64 major earthquakes we have had in Mexico (<a href="https://en.wikipedia.org/wiki/List_of_earthquakes_in_Mexico">according to Wikipedia</a>).</p><p>But why do many people think such an event is less likely than it really is? <br>The <a href="https://en.wikipedia.org/wiki/Birthday_problem">birthday paradox</a> has a similar effect. You only need 23 random people to have a 50% chance of having two of them have the exact same birthday, but most people will not think this is as likely. </p><p>A friend of mine pointed out that two of the three earthquakes had substantial casualties. About 30% of major earthquakes in Mexico have had substantial casualties <a href="https://en.wikipedia.org/wiki/List_of_earthquakes_in_Mexico">according to Wikipedia</a> (defined as having 25 or more casualties). I updated my script and ran a second simulation to compute the probability of having at least 3 major earthquakes on the same date, where at least 2 of those had substantial casualties. I did this by picking random dates and then picking a weighted random flag with a 30% chance of that flag being true. 
</p><p><strong>The probability of that happening</strong> - of at least 3 major earthquakes in which at least 2 of them had substantial casualties on the exact same day -<strong> was still higher than 7%.</strong> This is several thousand times more likely than my wife winning a raffle for a fully paid trip to Maui at the Four Seasons (which she won), or her winning a raffle for a free safari trip to South Africa (which she also won!). </p><p>Sometimes people, including data scientists, need to challenge their own preconceptions and biases.</p>]]></content:encoded></item><item><title><![CDATA[My Journey to Becoming a Data Scientist]]></title><description><![CDATA[My journey from a software developer to a data scientist with over a decade of experience. Learn about my mentorship and diverse experiences in the field.]]></description><link>https://www.datascienceisnotrocketscience.com/p/a-little-bit-about-me</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/a-little-bit-about-me</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Fri, 16 Sep 2022 14:48:47 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F06deac90-6b28-45c2-947f-1872726608f0_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!DTW2!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F06deac90-6b28-45c2-947f-1872726608f0_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" 
srcset="https://substackcdn.com/image/fetch/$s_!DTW2!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F06deac90-6b28-45c2-947f-1872726608f0_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!DTW2!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F06deac90-6b28-45c2-947f-1872726608f0_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!DTW2!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F06deac90-6b28-45c2-947f-1872726608f0_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!DTW2!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F06deac90-6b28-45c2-947f-1872726608f0_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!DTW2!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F06deac90-6b28-45c2-947f-1872726608f0_1024x1024.png" width="432" height="432" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/06deac90-6b28-45c2-947f-1872726608f0_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:432,&quot;bytes&quot;:1550441,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" 
alt="" srcset="https://substackcdn.com/image/fetch/$s_!DTW2!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F06deac90-6b28-45c2-947f-1872726608f0_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!DTW2!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F06deac90-6b28-45c2-947f-1872726608f0_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!DTW2!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F06deac90-6b28-45c2-947f-1872726608f0_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!DTW2!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F06deac90-6b28-45c2-947f-1872726608f0_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg 
xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Hello, my name is Alan and I am an experienced data scientist and machine learning engineer with over 12 years of experience. Originally from Mexico City, I am passionate about using data to solve complex business problems and develop innovative data products.</p><p>My interest in computer science stemmed from my childhood dream of becoming a software developer. However, during my college internship, I worked with statisticians and was intrigued by the models they built and the process of constructing them. That experience led me to take as many machine learning, data science, and statistics courses as possible during my undergraduate and graduate computer science programs, as well as after graduation.</p><p>In the early years of my career, I made a conscious effort to move away from general software development and focus on data science. I was fortunate to land a job at Microsoft, where I was mentored by a seasoned statistician and had the opportunity to learn from experienced practitioners who emphasized the importance of fully understanding business problems before developing solutions.</p><p>Throughout my career, I have been fortunate to have diverse experiences, from working for small startups in various industries to consulting for some of the biggest companies in different fields. 
I have also had the privilege of working under exceptional leaders who empowered and supported me, teaching me the value of prioritizing working with great managers, leaders, and colleagues.</p><p>Outside of work, I enjoy traveling with my partner Carla, indulging in delicious cuisine, snowboarding, hiking, and embracing all that life has to offer. You can follow our adventures on <a href="https://www.instagram.com/whereonplanetearth/">Instagram</a> or on our blog <span class="mention-wrap" data-attrs="{&quot;name&quot;:&quot;Where on Planet Earth&quot;,&quot;id&quot;:1084628,&quot;type&quot;:&quot;pub&quot;,&quot;url&quot;:&quot;https://open.substack.com/pub/whereonplanetearth&quot;,&quot;photo_url&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/8c20c05a-ad33-4011-af8e-643059e9036f_500x500.png&quot;,&quot;uuid&quot;:&quot;5e2a8c11-7c03-428a-90ad-9a43e244ca69&quot;}" data-component-name="MentionToDOM"></span>.</p>]]></content:encoded></item><item><title><![CDATA[What this Data Science Publication is and is not]]></title><description><![CDATA[Gain insights from a data scientist's career and learn simple solutions for business problems. 
Connect for advisory opportunities.]]></description><link>https://www.datascienceisnotrocketscience.com/p/what-this-publication-is-and-is-not</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/what-this-publication-is-and-is-not</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Fri, 16 Sep 2022 14:41:39 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1aea0fa4-640c-4d47-a30a-f73d57272a92_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!CoHe!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1aea0fa4-640c-4d47-a30a-f73d57272a92_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!CoHe!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1aea0fa4-640c-4d47-a30a-f73d57272a92_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!CoHe!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1aea0fa4-640c-4d47-a30a-f73d57272a92_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!CoHe!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1aea0fa4-640c-4d47-a30a-f73d57272a92_1024x1024.png 1272w, 
https://substackcdn.com/image/fetch/$s_!CoHe!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1aea0fa4-640c-4d47-a30a-f73d57272a92_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!CoHe!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1aea0fa4-640c-4d47-a30a-f73d57272a92_1024x1024.png" width="462" height="462" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/1aea0fa4-640c-4d47-a30a-f73d57272a92_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:462,&quot;bytes&quot;:1480057,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!CoHe!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1aea0fa4-640c-4d47-a30a-f73d57272a92_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!CoHe!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1aea0fa4-640c-4d47-a30a-f73d57272a92_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!CoHe!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1aea0fa4-640c-4d47-a30a-f73d57272a92_1024x1024.png 
1272w, https://substackcdn.com/image/fetch/$s_!CoHe!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F1aea0fa4-640c-4d47-a30a-f73d57272a92_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>This publication is a compilation of business problems that I have tackled throughout my career, with a focus on simple solutions. 
Its purpose is to share my experiences, thoughts, and high-level recommendations with those interested in pursuing a career in this field.</p><p>While this material does not delve deeply into implementation details or the use of recommended tools, ample documentation is available online.</p><p>My objectives in creating this material are threefold, and I hope to achieve at least one of the following:</p><ul><li><p>Debunk the notion that a PhD-level expertise in neural networks is necessary to succeed as a data scientist and solve significant business problems.</p></li><li><p>Share simple solutions that have successfully addressed major business problems during my career, with the aim of inspiring others.</p></li><li><p>Publish this work in the hope of transitioning from a traditional corporate career to a more flexible lifestyle that combines my passion for long-term travel with my passion for data science and machine learning. I am currently available for advisory opportunities with startups or large companies in this field. 
If you believe that we can collaborate, please do not hesitate to connect with me on LinkedIn (@alankrumholz) or send me a message.</p><p></p></li></ul><p><strong>Acknowledgments:</strong></p><p>I would like to express my sincere gratitude to my life partner, <span class="mention-wrap" data-attrs="{&quot;name&quot;:&quot;Carla Villoria&quot;,&quot;id&quot;:10735897,&quot;type&quot;:&quot;user&quot;,&quot;url&quot;:null,&quot;photo_url&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/a8954206-97c2-4fae-84c9-7a9191b203f8_2049x2213.jpeg&quot;,&quot;uuid&quot;:&quot;8407f32a-1334-4f91-90c9-b8caadd7525b&quot;}" data-component-name="MentionToDOM"></span>, for her valuable editing contributions and unwavering support and encouragement towards my pursuits.</p><p>I would also like to extend my appreciation to the exceptional mentors, leaders, and data scientists who have been instrumental in propelling my career and collaborating with me on the various projects that will be discussed in this publication.</p>]]></content:encoded></item><item><title><![CDATA[Data Science: Solutions for Business Problems]]></title><description><![CDATA[Solve real business problems with Data Science and Machine Learning. 
Discover key skills, business solutions, and more in this reader-supported publication.]]></description><link>https://www.datascienceisnotrocketscience.com/p/coming-soon</link><guid isPermaLink="false">https://www.datascienceisnotrocketscience.com/p/coming-soon</guid><dc:creator><![CDATA[Alan Krumholz]]></dc:creator><pubDate>Fri, 16 Sep 2022 06:07:03 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F865e849b-5ecf-43c8-b06a-fbfbf7ed2059_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!f6EH!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F865e849b-5ecf-43c8-b06a-fbfbf7ed2059_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!f6EH!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F865e849b-5ecf-43c8-b06a-fbfbf7ed2059_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!f6EH!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F865e849b-5ecf-43c8-b06a-fbfbf7ed2059_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!f6EH!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F865e849b-5ecf-43c8-b06a-fbfbf7ed2059_1024x1024.png 1272w, 
https://substackcdn.com/image/fetch/$s_!f6EH!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F865e849b-5ecf-43c8-b06a-fbfbf7ed2059_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!f6EH!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F865e849b-5ecf-43c8-b06a-fbfbf7ed2059_1024x1024.png" width="548" height="548" data-attrs="{&quot;src&quot;:&quot;https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/865e849b-5ecf-43c8-b06a-fbfbf7ed2059_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:548,&quot;bytes&quot;:1480057,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!f6EH!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F865e849b-5ecf-43c8-b06a-fbfbf7ed2059_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!f6EH!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F865e849b-5ecf-43c8-b06a-fbfbf7ed2059_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!f6EH!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F865e849b-5ecf-43c8-b06a-fbfbf7ed2059_1024x1024.png 
1272w, https://substackcdn.com/image/fetch/$s_!f6EH!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F865e849b-5ecf-43c8-b06a-fbfbf7ed2059_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p><strong>This is Data Science is not Rocket Science</strong>, a small publication about simple and effective solutions to real business problems.</p><h2>Publication Content</h2><h3>Introduction</h3><ul><li><p><a href="https://www.datascienceisnotrocketscience.com/p/what-this-publication-is-and-is-not">What this Data 
Science Publication is and is not</a></p></li><li><p><a href="https://www.datascienceisnotrocketscience.com/p/a-little-bit-about-me">My Journey to Becoming a Data Scientist</a></p></li><li><p><a href="https://www.datascienceisnotrocketscience.com/p/what-is-truly-important-to-be-a-successful">Tips for Succeeding in Data Science without a PhD</a></p></li></ul><h3>Business Solutions Series</h3><p>The Business Solutions Series is a compilation of solutions to various business challenges that I have encountered throughout my professional journey.</p><ul><li><p><a href="https://www.datascienceisnotrocketscience.com/p/cracking-down-on-software-piracy">Stopping Software Piracy: Using Simple Statistical Test</a></p></li><li><p><a href="https://www.datascienceisnotrocketscience.com/p/efficient-elevator-repairs">Efficient Elevator Repairs Using Machine Learning</a></p></li><li><p><a href="https://www.datascienceisnotrocketscience.com/p/finding-dishonest-merchants">AutoML Stops Fraudulent Merchants</a></p></li><li><p><a href="https://www.datascienceisnotrocketscience.com/p/human-conversation-analysis">Analyzing Customer Conversations with NLP</a></p></li><li><p><a href="https://www.datascienceisnotrocketscience.com/p/identifying-online-bots">Combating Ad Fraud with Advanced Bot Detection Solutions</a></p></li><li><p><a href="https://www.datascienceisnotrocketscience.com/p/optimal-pricing">Optimal Pricing for Revenue Maximization</a></p></li><li><p><a href="https://www.datascienceisnotrocketscience.com/p/frictionless-document-digitization">Computer Vision: Frictionless Mobile Document Digitization</a></p></li><li><p><a href="https://www.datascienceisnotrocketscience.com/p/accurately-forecasting-sales-months">Accurately Forecasting Sales Months in Advance with ML</a></p></li></ul><h3>Miscellaneous</h3><ul><li><p><a href="https://www.datascienceisnotrocketscience.com/p/making-the-case-for-cloud-managed">Cloud Managed Services &amp; Favorite Machine Learning 
Tools</a></p></li><li><p><a href="https://www.datascienceisnotrocketscience.com/p/what-is-the-probability-that-mexico">Three Major Earthquakes in Mexico: Exploring the Probability</a></p></li><li><p><a href="https://www.datascienceisnotrocketscience.com/p/how-did-ai-help-create-this-post">How AI tools helped me create this data science post</a></p></li><li><p><a href="https://www.datascienceisnotrocketscience.com/p/online-advertisement-experiment">Online Ad Experiment: Comparing Ad Channels</a></p></li><li><p><a href="https://www.datascienceisnotrocketscience.com/p/avoiding-false-positives-in-ab-testing">Navigate A/B Testing Pitfalls: Avoid False Positives</a></p></li><li><p><a href="https://www.datascienceisnotrocketscience.com/p/the-next-frontier-for-automl-learning">The Next Frontier for AutoML: Learning Custom Embeddings</a></p></li><li><p><a href="https://www.datascienceisnotrocketscience.com/p/importance-of-model-calibration-techniques">Importance of Model Calibration: Techniques &amp; Benefits</a></p></li></ul>]]></content:encoded></item></channel></rss>