08.散点图_淘宝-商品属性.md

# %% [markdown] cell-5e351e7966a3
## 🕐请从本地上传一个用电商记插件采集的新版淘宝搜索结果（含综合排序和销量排序）的Excel文件。[下载示例](https://www.dianshangji.cn/u/user5344/cfiles/browse/index?fid=3)
# %%--- [html] cell-a5b79fdf3c10
# properties:
#   run_on_load: true
# ---%%
请上传一个Excel文件：<input class="btn btn-primary" type="file" id="fileInput" multiple />
# %%--- [python] cell-f74e05782eef
# properties:
#   run_on_load: true
#   top_hidden: true
# ---%%
print("开始初始化数据分析和可视化的运行环境,请稍候……")
import micropip
await micropip.install("/pypi/openpyxl-3.1.5-py2.py3-none-any.whl")
await micropip.install("/pypi/pyfonts-0.0.2-py3-none-any.whl")
import pandas as pd
import pyodide
from pyfonts import load_font

# 文件的URL
file_url = '/assets/fonts/NotoSansSC-Regular.ttf'  # 请将此URL替换为实际文件的URL
# 本地保存路径
local_file_path = '/NotoSansSC-Regular.ttf'
# 下载文件并保存到本地
try:
	response = await pyodide.http.pyfetch(file_url)
	# Return the response body as a bytes object
	image_data = await response.bytes()
	with open(local_file_path, 'wb') as f:
		f.write(image_data)
	print(f"中文字体已成功下载并保存为: {local_file_path}")
except Exception as e:
	print(f"下载文件时出错: {e}")

font = load_font(font_path="/NotoSansSC-Regular.ttf")
print("数据分析和可视化的运行环境已经就绪。可以进行第1步了。")
# %%--- [javascript] cell-d28877a258f5
# properties:
#   run_on_load: true
# ---%%
const fileInput = document.getElementById('fileInput');

async function uploadFiles(){
  // 先创建一个文件名到对应保存文件名的映射
  const fileMapping = {
      '淘宝': '/taobao.xlsx'
  };
  
  // 遍历用户上传的所有文件
  for (let file of files) {
    const fileName = file.name.toLowerCase();
  
    // 判断文件名包含的关键词，并根据关键词确定保存文件的名称
    let pyodideFileName = null;
    for (let key in fileMapping) {
      if (fileName.includes(key.toLowerCase())) {
        pyodideFileName = fileMapping[key];
        break;
      }
    }
  
    // 如果文件名不符合要求
    if (!pyodideFileName) {
      alert("Each file must be named with one of these keywords: '淘宝'");
      return;
    }
  
    // 读取文件为 ArrayBuffer
    const arrayBuffer = await file.arrayBuffer();
    
    // 将 ArrayBuffer 转换为 Uint8Array
    const uint8Array = new Uint8Array(arrayBuffer);
    
    // 写入文件到 Pyodide 文件系统
    console.log(pyodideFileName, uint8Array.length + '字节');
    pyodide.FS.writeFile(pyodideFileName, uint8Array);
  }
  alert('淘宝数据文件已上传至Python内存文件。')
}
const files = fileInput.files;
if (files.length == 0) {
 	fileInput.addEventListener('change', uploadFiles);
    return;
}
uploadFiles();
# %% [markdown] cell-7bd9d56dbdb8
## 🕑数据清洗与预处理
# %% [python] cell-317b418f482f
import io
import re
from collections import defaultdict

# 读取一系列颜色值用于数据可视化的Preattentive Attributes
# See https://www.dianshangji.cn/u/user5344/cfiles/browse/index?fid=4
response = await pyodide.http.pyfetch('/assets/visualization/colors.csv')
# 将响应体作为文本读取
csv_str = await response.string()
# 使用pandas将CSV字符串转换为DataFrame
df_color = pd.read_csv(io.StringIO(csv_str))
print(df_color.head(5))
color_names = ",".join(df_color['Color'])
response = await pyodide.http.pyfetch('/assets/visualization/colors_cn.csv')
# 将响应体作为文本读取
csv_str = await response.string()
# 使用pandas将CSV字符串转换为DataFrame
df_color_cn = pd.read_csv(io.StringIO(csv_str))
print(df_color_cn.head(5))

print("开始读取内存中的Excel文件……")
# 读取Excel文件，跳过前两行（即从第三行开始读取），只保留“价格”、“月销量”和“属性”三个字段
excel_file_path = '/taobao.xlsx'

df_zonghe = pd.read_excel(excel_file_path, skiprows=2, usecols=['商品ID','价格','月销量', '属性'], sheet_name = "综合")
df_xiaoliang = pd.read_excel(excel_file_path, skiprows=2, usecols=['商品ID','价格','月销量', '属性'], sheet_name = "销量")

# 处理月销量字段的函数
def clean_month_sales(sales_str):
    if pd.isna(sales_str):
        return None
    
    sales_str = str(sales_str).strip()
    
    # 根据不同的格式估算月销量
    if "本月行业热销" in sales_str:
        return 1000
    elif "万+" in sales_str:
        match = re.search(r'(\d+)', sales_str)
        if match:
            return int(match.group(1)) * 10000  # 假设每万+表示的销量为数字乘以10000
    elif "+人" in sales_str:
        match = re.search(r'(\d+)', sales_str)
        if match:
            return int(match.group(1))  # 假设+人后跟的是具体的销量数字
    
    try:
        return int(sales_str)  # 尝试将字符串转为整数
    except ValueError:
        return None

def processdata(df):      
	df['月销量'] = df['月销量'].apply(clean_month_sales)      
    # 筛选出“价格”小于等于1000元的记录
	df = df[df['价格'] <= 1000]
	print("筛选出“价格”小于等于1000元的记录")
    #print(df.head(5))

	# 筛选出“属性”存在的记录
	df = df[df['属性'].apply(lambda x: isinstance(x, str))]

    # 筛选出“月销量”大于等于0，小于等于5000的记录
	df = df[df['月销量'].apply(lambda x: (1 <= x <= 5000))]
	print("筛选出“月销量”小于等于5000的记录")
    #print(df.head(5))  

    # 检查商品ID是否有重复记录
	if df['商品ID'].duplicated().any():
		print("存在重复的商品ID记录，正在去除重复条目...")
		df = df.drop_duplicates(subset=['商品ID'], keep='first')
	else:
		print("没有重复的商品ID记录。")
    
    # 打印前5条记录以检查结果
	print(df.head(5))
    
	print("有效记录总数：", len(df))

	return df

# 处理“月销量”字段
df_zonghe = processdata(df_zonghe)
df_xiaoliang = processdata(df_xiaoliang)

# 使用defaultdict来存储结果，使得可以自动处理不存在的键
attributeset = defaultdict(set)

attribute_weight = {}

def process_attributes(df, result, attribute_weight):
  data = df['属性']
    
  for line in data:
    if not isinstance(line, str):
      continue
    # 按照'|'分割每一行    
    items = line.split('|')
    for item in items:
      key, value = item.split(':')
      result[key].add(value)        
      if item in attribute_weight:
      	attribute_weight[item] += 1
      else:
        attribute_weight[item] = 1			
  return  

process_attributes(df_zonghe, attributeset, attribute_weight)
process_attributes(df_xiaoliang, attributeset, attribute_weight)

# 使用列表推导式生成要删除的键的列表
keys_to_delete = [k for k, v in attributeset.items() if len(v) == 1 or len(v) > 10]

# 遍历要删除的键的列表，并从 defaultdict 中删除这些键
for k in keys_to_delete:
    del attributeset[k]

# 打印结果，值以'|'分隔
attributeset_text = ''
for key, values in attributeset.items():
  sorted_values = sorted(values, key=lambda x: 0-attribute_weight[key + ':' + x])
  attributeset[key] = sorted_values
  print(f"{key}: { '|'.join(sorted_values) }")
  attributeset_text += '\n' + f"{key}: { '|'.join(sorted_values) }"

# 输出处理后的 defaultdict 来验证结果
print(attributeset)

# 创建一个字典，英文颜色名作为键，中文颜色名作为值
color_dict = dict(zip(df_color['Color'], df_color_cn['颜色名']))
print(color_dict)

import js

js.window.openNotebookAssistant('cell-916c6fd3ac93', 'python', '你是一名商业分析师，为了进行有效的数据分析，尤其是当考虑使用Preattentive Attributes（如颜色、大小、形状和长度）来代表数据时，选择那些能够显著区分商品特征且对用户购买决策有重大影响的属性尤为重要。你的任务是从如下描述商品的若干行属性名+冒号+属性值（以|字符作为分隔）中选出最有意义的10个属性名，每个属性名的所有属性值应限定在同一分类维度内。它们的属性值将用Preattentive Attributes代表，优先考虑颜色，其次考虑大小、形状和长度。你要为每一个属性的第一个属性名挑选最适合的颜色作为视觉特征。任一颜色名必须从这个颜色名集合中选择：' + color_names + '。\n你把选好的这些属性放在一个JSON对象中取名为selected_attributes，作为key，每个属性的第一个属性值及其颜色作为value，然后按照指定的格式书写，即一个属性值+冒号+一个相应的颜色名作为一对。你用python编程语言书写这个JSON对象。\n\n商品描述的属性数据如下：' + attributeset_text)
# %% [markdown] cell-e9ecfea97fd9
## 🕒选择商品属性和代表色
# %% [python] cell-916c6fd3ac93
selected_attributes = {
    "适用对象": "青年:Hermosa Pink",
    "基础风格": "青春流行:Corinthian Pink",
    "裤长": "长裤:Cameo Pink",
    "厚薄": "加绒加厚:Fawn",
    "腰型": "中腰:Light Brown Drab",
    "适用季节": "四季通用:Coral Red",
    "裤脚口款式": "直筒:Fresh Color",
    "颜色": "深色:Grenadine Pink",
    "面料功能": "抗皱:Eosine Pink",
    "版型": "宽松型:Spinel Red"
}
print(selected_attributes)
# %% [markdown] cell-14aeb311d93e
## 🕓分配各项属性颜色
# %% [python] cell-2a797c95a7a1
import colorsys

print(selected_attributes)

# 创建一个新的字典 selected_attributes_cn 来存储转换中文颜色名后的结果
selected_attributes_cn = {}

# 遍历 selected_attributes 的每一项
for key, value in selected_attributes.items():
    # 分割每项中的颜色名和中文描述  
    new_value = []
    hex_value = []    
    for item in value.split("|"):
        description, color_name = item.split(":")
        # 查找颜色对应的中文名称
        color_name_cn = color_dict.get(color_name, color_name)  # 如果没有找到对应的中文名，保持原名
        new_value.append(f"{description}:{color_name_cn}")
        color_hex = df_color[df_color['Color'] == color_name]['Hex']
        if color_hex.empty:
        	print(color_name)
        hex = color_hex.values[0]
        hex_value.append(f"{description}:{hex}")
            
    # 将转换后的值放入新的字典中
    selected_attributes_cn[key] = "|".join(new_value)
    
# 打印输出转换后的字典
print(selected_attributes_cn)

selected_attribute_hex = {}
colorset = {}

def hex_to_rgb(hex_color):
    """
    Convert a hexadecimal color string to an RGB tuple.
    
    :param hex_color: A hexadecimal color string (e.g., "#FFFFFF").
    :return: An RGB tuple.
    """
    hex_color = hex_color.lstrip('#')
    return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))

def rgb_to_hex(rgb):
    """
    Convert an RGB tuple to a hexadecimal color string.
    
    :param rgb: An RGB tuple.
    :return: A hexadecimal color string.
    """
    return '#{:02x}{:02x}{:02x}'.format(*rgb)

def get_hsv_from_rgb(r, g, b):
    """Convert RGB to HSV."""
    return colorsys.rgb_to_hsv(r, g , b)

def generate_uniform_hues(base_hex, num_colors=4):
    """
    Given a hexadecimal color string, generates num_colors additional colors that are evenly distributed
    around the hue circle and returns them as hexadecimal strings.
    
    :param base_hex: A hexadecimal color string (e.g., "#FFFFFF").
    :param num_colors: Number of additional colors to generate.
    :return: A list of hexadecimal color strings.
    """
    # Convert hexadecimal string to RGB tuple
    base_rgb = hex_to_rgb(base_hex)
    r, g, b = [x / 255.0 for x in base_rgb]  # Normalize RGB values to [0, 1]
    h, s, v = get_hsv_from_rgb(r, g, b)

    # 任何颜色的饱和度都设置为1
    s = 1

    # Calculate step size for uniform distribution across the hue circle
    step = 360 // (num_colors + 1)

    # Generate new colors
    colors = []
    for i in range(0, num_colors):
        new_hue = (h * 360 + i * step) % 360  # Update hue angle
        
        new_rgb_normalized = colorsys.hsv_to_rgb(new_hue / 360.0, s, v)        
        new_rgb = tuple(round(v * 255) for v in new_rgb_normalized)  # Convert back to 0-255 scale
        
        colors.append(rgb_to_hex(new_rgb))  # Convert to hexadecimal and add to list

    return colors

for key, value in selected_attributes.items():
    attribute_value, color_name = value.split(':')
    color_hex = df_color[df_color['Color'] == color_name]['Hex'].values[0]
    attribute_values = attributeset[key]    
    attribute_colors = generate_uniform_hues(color_hex, len(attribute_values))    
    
    values = []
    colorset[key + ':' + attribute_value] = color_hex
    for i in range(0, len(attribute_values)):
        hex = attribute_colors[i]
        values.append(attribute_values[i] + ':' + hex)
        colorset[key + ':' + attribute_values[i]] = hex
    selected_attribute_hex[key] = "|".join(values)    

print(selected_attribute_hex)

# %% [markdown] cell-89d989a446f2
## 🕔绘制属性分布散点图
# %% [python] cell-073d44a9c7d3
import matplotlib.pyplot as plt
import numpy as np
import random

def drawscatter(df, caption, selected_attribute):	
	plt.figure(figsize=(10, 6))
	colors = []
	edgecolors = []	
	scatters = {
		selected_attribute + ':其他属性': {
			'c': 'none',
			'edgecolors': 'gray',
			'x': [],
			'y': []
		}
	}
	for row in df.itertuples():
		attribute = row.属性
		found = False
		for key, value in colorset.items():
			if key.find(selected_attribute) == 0 and attribute.find(key) >= 0:
				if not key in scatters:
					scatters[key] = {
						'c': value,
						'edgecolors': value,
						'x': [],
						'y': []
					}
				scatters[key]['x'].append(row.价格)
				scatters[key]['y'].append(row.月销量)
				
				found = True
				break
		if not found:
			key = selected_attribute + ':其他属性'
			scatters[key]['x'].append(row.价格)
			scatters[key]['y'].append(row.月销量)			
            	
	for key, value in scatters.items():		
		plt.scatter(value['x'], [y + 50 * random.random() for y in value['y']], c = value['c'], edgecolors = value['edgecolors'], alpha=0.5, s = 8, label = key.split(':')[1], linewidths=1)

	plt.title(caption + ' ' + selected_attribute + ' 价格 vs 月销量 散点图', font=font)
	plt.xlabel('价格', font=font)
	plt.ylabel('月销量', font=font)
	plt.grid(True)
	plt.legend(prop = font)
	plt.show()
	print(caption + " 散点图绘制完毕：", selected_attribute)

for attribute in selected_attributes:
	drawscatter(df_zonghe, '淘宝搜索综合排序', attribute)
	drawscatter(df_xiaoliang, '淘宝搜索销量排序', attribute)